Working on new constraint algorithm

6fd7f3bb · Peter Eastman · f23db9f1 · 6fd7f3bb · 6fd7f3bb · 6fd7f3bb
Commit 6fd7f3bb authored May 21, 2009 by Peter Eastman
17 changed files
--- a/jama/include/tnt_i_refvec.h
+++ b/jama/include/tnt_i_refvec.h
--- a/jama/include/tnt_math_utils.h
+++ b/jama/include/tnt_math_utils.h
--- a/jama/include/tnt_sparse_matrix_csr.h
+++ b/jama/include/tnt_sparse_matrix_csr.h
--- a/jama/include/tnt_stopwatch.h
+++ b/jama/include/tnt_stopwatch.h
--- a/jama/include/tnt_subscript.h
+++ b/jama/include/tnt_subscript.h
--- a/jama/include/tnt_vec.h
+++ b/jama/include/tnt_vec.h
--- a/jama/include/tnt_version.h
+++ b/jama/include/tnt_version.h
--- a/libraries/quern/README
+++ b/libraries/quern/README
+Quern 0.9 (beta)
+Public domain code for sparse up-looking Givens QR, including incomplete
+drop tolerance version for preconditioning iterative solvers.
+Edit Makefile to fit your set up (in particular, MATLAB-specific stuff).
+Currently it's set up for debugging; change the flags to get optimized
+versions of the code.
+In addition to the library libquern.a (with header quern.h), there are
+two mex files (quern_internal_mex and quern_cgnr_mex) as well as a
+stand-alone executable test_mat for running tests on matrices stored
+in .mat files.
+- Robert Bridson
+  UBC Computer Science
+  February 4, 2009
--- a/libraries/quern/include/quern.h
+++ b/libraries/quern/include/quern.h
+/*
+* Quern: a sparse QR library.
+* This code is in the public domain.
+* - Robert Bridson
+*/
+#ifndef QUERN_H
+#define QUERN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Function return values: nonzero indicates an error
+#define QUERN_OK 0
+#define QUERN_INPUT_ERROR 1
+#define QUERN_OUT_OF_MEMORY 2
+// Takes a compressed sparse column format matrix and outputs it in compressed
+// sparse row format (suitable for the QR factorization routines).
+// Requires A1_row_start to have room for num_columns+1 entries,
+//          A1_column_index to have room for all nonzeros,
+//      and A1_value to have room for all nonzeros.
+int QUERN_convert_column_format_to_row_format(int num_rows,
+                                              int num_columns,
+                                              const int* A0_column_start,
+                                              const int* A0_row_index,
+                                              const double* A0_value,
+                                              int* A1_row_start,
+                                              int* A1_column_index,
+                                              double* A1_value);
+// Takes a compressed sparse row format m*n matrix and finds a reversed
+// breadth-first search column ordering, suitable for incomplete QR
+// preconditioning.
+// Requires column_order to have room for n entries.
+int QUERN_get_rbfs_column_ordering(int m,
+                                   int n,
+                                   const int* A_row_start,
+                                   const int* A_column_index,
+                                   int* column_order);
+// Takes a length n column ordering and a compressed sparse row format m*n
+// matrix, then reorders each row accordingly.
+int QUERN_reorder_columns(int m,
+                          int n,
+                          const int* column_order,
+                          const int* A_row_start,
+                          int* A_column_index,
+                          double* A_value);
+// Takes a compressed sparse row format m*n matrix and finds a row ordering
+// which minimizes the lower profile---in particular, if A's rows can be
+// permuted to make it upper triangular, row_order will do the trick.
+// This is recommended as a cheap pre-process before QR, at least if nothing
+// is known about the current structure, to improve performance.
+// Requires row_order to have room for m integers.
+int QUERN_get_profile_row_ordering(int m,
+                                   int n,
+                                   const int* A_row_start,
+                                   const int* A_column_index,
+                                   int* row_order);
+// Reorders a given input vector x into an output vector y of length m.
+int QUERN_reorder_vector(int m,
+                         const int* order,
+                         const double* x,
+                         double* y);
+// Applies the inverse order to a given input vector x, saving in an output
+// vector y of length m.
+int QUERN_inverse_order_vector(int m,
+                               const int* order,
+                               const double* x,
+                               double* y);
+// Takes a compressed sparse row format m*n matrix (m>=n) and outputs the upper
+// triangular R factor from QR, also in compressed sparse row format. If
+// row_order is non-null, it should contain the order in which rows of A should be
+// taken; if it is null, the natural ordering is used. Memory for R is
+// allocated internally; use QUERN_free_result to free.
+int QUERN_compute_qr_without_q(int m,
+                               int n,
+                               const int* A_row_start,
+                               const int* A_column_index,
+                               const double* A_value,
+                               const int* row_order,
+                               int** ptr_R_row_start,
+                               int** ptr_R_column_index,
+                               double** ptr_R_value);
+// Takes a compressed sparse row format m*n matrix (m>=n) and outputs the QR
+// factors. The storage for Q encodes a sequence of Givens rotations and row
+// swaps; it is not directly usable as a standard sparse matrix. R, however,
+// is stored in standard compressed sparse row format. If row_order is non-null
+// it should contain the order in which rows of A should be taken; if it is
+// null, the natural ordering is used. Memory for Q and R is allocated
+// internally; use QUERN_free_result to free.
+int QUERN_compute_qr(int m,
+                     int n,
+                     const int* A_row_start,
+                     const int* A_column_index,
+                     const double* A_value,
+                     const int* row_order,
+                     int** ptr_Q_row_start,
+                     int** ptr_Q_column_index,
+                     double** ptr_Q_value,
+                     int** ptr_R_row_start,
+                     int** ptr_R_column_index,
+                     double** ptr_R_value);
+// Takes a compressed sparse row format m*n matrix (m>=n) and outputs the upper
+// triangular R factor from QR, with small entries dropped, also in compressed
+// sparse row format. If row_order is non-null, it should contain the order in
+// which rows of A should be taken; if it is null, the natural ordering is used.
+// Memory for R is allocated internally; use QUERN_free_result to free.
+int QUERN_compute_incomplete_qr_without_q(int m,
+                                          int n,
+                                          const int* A_row_start,
+                                          const int* A_column_index,
+                                          const double* A_value,
+                                          const int* row_order,
+                                          double drop_tolerance,
+                                          int** ptr_R_row_start,
+                                          int** ptr_R_column_index,
+                                          double** ptr_R_value);
+// Takes a compressed sparse row format m*n matrix (m>=n) and outputs incomplete
+// QR factors. The storage for Q encodes a sequence of Givens rotations and row
+// swaps; it is not directly usable as a standard sparse matrix. R, however,
+// is stored in standard compressed sparse row format. If row_order is non-null
+// it should contain the order in which rows of A should be taken; if it is
+// null, the natural ordering is used. Memory for Q and R is allocated
+// internally; use QUERN_free_result to free.
+int QUERN_compute_incomplete_qr(int m,
+                                int n,
+                                const int* A_row_start,
+                                const int* A_column_index,
+                                const double* A_value,
+                                const int* row_order,
+                                double drop_tolerance,
+                                int** ptr_Q_row_start,
+                                int** ptr_Q_column_index,
+                                double** ptr_Q_value,
+                                int** ptr_R_row_start,
+                                int** ptr_R_column_index,
+                                double** ptr_R_value);
+// Free the memory allocated during QR factorization (for either Q or R).
+// After calling this, do not try to access the factor again!
+void QUERN_free_result(int* row_start,
+                       int* column_index,
+                       double* value);
+// Compute the product of an m*n CSR matrix with n-vector input in
+// m-vector result.
+int QUERN_multiply(int m,
+                   int n,
+                   const int* row_start,
+                   const int* column_index,
+                   const double* value,
+                   const double* input,
+                   double* result);
+// Compute the product of an m*n CSR matrix transposes with m-vector input in
+// n-vector result.
+int QUERN_multiply_transpose(int m,
+                             int n,
+                             const int* row_start,
+                             const int* column_index,
+                             const double* value,
+                             const double* input,
+                             double* result);
+// Compute the product Q*x in-place (Q from QR factorization).
+// Note: if the input is naturally a length n vector, you should pad it with
+// zeros to make it length m.
+// If a row reordering of A was involved, you probably want to
+// call QUERN_inverse_order_vector afterwards.
+int QUERN_multiply_with_q(int m,
+                          const int* Q_row_start,
+                          const int* Q_column_index,
+                          const double* Q_value,
+                          double* x);
+// Compute the product Q^T*x in-place (Q from QR factorization).
+// If a row reordering of A was involved, you probably want to
+// call QUERN_reorder_vector first.
+int QUERN_multiply_with_q_transpose(int m,
+                                    const int* Q_row_start,
+                                    const int* Q_column_index,
+                                    const double* Q_value,
+                                    double* x);
+// Compute result=R^{-1}*rhs (R from QR factorization).
+// The vector result may actually be aliased to rhs, for an in-place solve.
+int QUERN_solve_with_r(int n,
+                       const int* R_row_start,
+                       const int* R_column_index,
+                       const double* R_value,
+                       const double* rhs,
+                       double* result);
+// Compute R^{-T}*x in-place (R from QR factorization).
+int QUERN_solve_with_r_transpose_in_place(int n,
+                                          const int* R_row_start,
+                                          const int* R_column_index,
+                                          const double* R_value,
+                                          double* x);
+// CGNR solver for A^T*A*x=rhs, starting with zero initial guess,
+// with R as preconditioner.
+int QUERN_solve_with_CGNR(int m,
+                          int n,
+                          const int* A_row_start,
+                          const int* A_column_index,
+                          const double* A_value,
+                          const double* rhs,
+                          const int* R_row_start,
+                          const int* R_column_index,
+                          const double* R_value,
+                          int max_iterations,
+                          double absolute_convergence_tolerance,
+                          double* x,
+                          int* return_solved,
+                          int* return_iterations,
+                          double* return_residual_norm);
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/libraries/quern/include/quern_list.h
+++ b/libraries/quern/include/quern_list.h
+#ifndef QUERN_LIST_H
+#define QUERN_LIST_H
+#include <algorithm>
+#include <cstdlib>
+// This is a light-weight singly linked list with half-decent
+// allocation, suitable only for plain-old-data, and designed
+// to make it easy to use robustly from plain C (e.g. no
+// exceptions, basic memory management with malloc/free and
+// returned error codes).
+namespace Quern {
+template<typename T>
+struct ListEntry
+{
+   T data;
+   ListEntry<T>* next;
+   ListEntry(void) {}
+   ListEntry(const T& data_, ListEntry<T>* next_=0)
+      : data(data_), next(next_) {}
+};
+const int puddle_size=1020;
+template<typename T>
+struct Puddle
+{
+   int end;
+   Puddle* next;
+   ListEntry<T> array[puddle_size];
+};
+template<typename T>
+struct Pool
+{
+   Puddle<T>* puddle_head;
+   ListEntry<T>* free_head;
+   Pool(void) : puddle_head(0), free_head(0) {}
+   ~Pool(void)
+   {
+      Puddle<T>* p=puddle_head;
+      while(p){
+         Puddle<T>* p_next=p->next;
+         std::free(p);
+         p=p_next;
+      }
+      puddle_head=0;
+      free_head=0;
+   }
+   ListEntry<T>* allocate(void)
+   {
+      // if we just have something free at hand, use it
+      if(free_head){
+         ListEntry<T>* p=free_head;
+         free_head=free_head->next;
+         return p;
+      }
+      // otherwise, if we know we need another puddle in the pool, try for it
+      if(!puddle_head || puddle_head->end==puddle_size){
+         Puddle<T>* new_puddle=(Puddle<T>*)std::malloc(sizeof(Puddle<T>));
+         if(!new_puddle) return 0;
+         new_puddle->end=0;
+         new_puddle->next=puddle_head;
+         puddle_head=new_puddle;
+      }
+      return &puddle_head->array[puddle_head->end++];
+   }
+   void release(ListEntry<T>* entry)
+   {
+      entry->next=free_head;
+      free_head=entry;
+   }
+};
+template<typename T>
+struct ListIterator
+{
+   ListEntry<T>** handle; // pointer to a "next" or "head" pointer in the list
+   ListIterator(ListEntry<T>** handle_=0)
+      : handle(handle_)
+   {}
+   bool still_going(void) const
+   {
+      assert(handle);
+      return *handle!=0;
+   }
+   void operator++(void)
+   {
+      assert(handle);
+      assert(*handle);
+      handle=&((*handle)->next);
+   }
+   T& operator*(void)
+   {
+      assert(handle);
+      assert(*handle);
+      return (*handle)->data;
+   }
+   T* operator->(void)
+   {
+      assert(handle);
+      assert(*handle);
+      return &(*handle)->data;
+   }
+   const T* operator->(void) const
+   {
+      assert(handle);
+      assert(*handle);
+      return &(*handle)->data;
+   }
+   const T& operator*(void) const
+   {
+      assert(handle);
+      assert(*handle);
+      return (*handle)->data;
+   }
+};
+template<typename T>
+struct List
+{
+   Pool<T>* pool;
+   ListEntry<T>* head;
+   int length;
+   List(void) {}
+   void init(Pool<T>* pool_)
+   {
+      pool=pool_;
+      head=0;
+      length=0;
+   }
+   ListIterator<T> begin(void)
+   {
+      return ListIterator<T>(&head);
+   }
+   bool empty(void)
+   {
+      return head==0;
+   }
+   // p ends up referring to the next element
+   void erase(ListIterator<T>& p)
+   {
+      assert(p.handle);
+      assert(*p.handle);
+      ListEntry<T>* q=*p.handle;
+      *p.handle=q->next;
+      pool->release(q);
+      --length;
+   }
+   T& front(void)
+   {
+      assert(head);
+      return head->data;
+   }
+   const T& front(void) const
+   {
+      assert(head);
+      return head->data;
+   }
+   // put the new element before the one p initially refers to;
+   // if successful, p ends up referring to the new element
+   bool insert(ListIterator<T>& p, const T& data)
+   {
+      ListEntry<T>* q=pool->allocate();
+      if(!q) return false;
+      q->next=*p.handle;
+      q->data=data;
+      *p.handle=q;
+      ++length;
+      return true;
+   }
+   void pop_front(void)
+   {
+      assert(head);
+      ListEntry<T>* p=head;
+      head=p->next;
+      pool->release(p);
+      --length;
+   }
+   bool push_front(const T& entry)
+   {
+      ListEntry<T>* p=pool->allocate();
+      if(!p) return false;
+      p->data=entry;
+      p->next=head;
+      head=p;
+      ++length;
+      return true;
+   }
+   int size(void)
+   {
+      return length;
+   }
+   void swap(List<T>& other)
+   {
+      assert(pool==other.pool);
+      std::swap(head, other.head);
+      std::swap(length, other.length);
+   }
+};
+} // end namespace
+#endif
--- a/libraries/quern/src/quern.cpp
+++ b/libraries/quern/src/quern.cpp
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "quern.h"
+int QUERN_convert_column_format_to_row_format(int num_rows,
+                                              int num_columns,
+                                              const int* A0_column_start,
+                                              const int* A0_row_index,
+                                              const double* A0_value,
+                                              int* A1_row_start,
+                                              int* A1_column_index,
+                                              double* A1_value)
+{
+   // check input
+   if(num_rows<=0 || num_columns<=0) return QUERN_INPUT_ERROR;
+   if(!A0_column_start || !A0_row_index || !A0_value) return QUERN_INPUT_ERROR;
+   if(!A1_row_start || !A1_column_index || !A1_value) return QUERN_INPUT_ERROR;
+   // figure out number of entries in each row
+   std::memset(A1_row_start, 0, (num_rows+1)*sizeof(int));
+   for(int i=0; i<num_columns; ++i){
+      if(A0_column_start[i]>A0_column_start[i+1]) return QUERN_INPUT_ERROR;
+      for(int j=A0_column_start[i]; j<A0_column_start[i+1]; ++j){
+         if(A0_row_index[j]<0 || A0_row_index[j]>=num_rows)
+            return QUERN_INPUT_ERROR;
+         ++A1_row_start[A0_row_index[j]+1];
+      }
+   }
+   // cumulative sum to get row_start
+   for(int i=0; i<num_rows; ++i)
+      A1_row_start[i+1]+=A1_row_start[i];
+   // use a temporary copy of row_start for keeping track of where we add
+   int* row_pointer=(int*)std::malloc(num_rows*sizeof(int));
+   if(!row_pointer) return QUERN_OUT_OF_MEMORY;
+   std::memcpy(row_pointer, A1_row_start, num_rows*sizeof(int));
+   // then fill in the entries
+   for(int i=0; i<num_columns; ++i){
+      for(int j=A0_column_start[i]; j<A0_column_start[i+1]; ++j){
+         int r=A0_row_index[j];
+         A1_column_index[row_pointer[r]]=i;
+         A1_value[row_pointer[r]]=A0_value[j];
+         ++row_pointer[r];
+      }
+   }
+   std::free(row_pointer);
+   return QUERN_OK;
+}
+int QUERN_multiply(int m,
+                   int n,
+                   const int* row_start,
+                   const int* column_index,
+                   const double* value,
+                   const double* input,
+                   double* result)
+{
+   if(m<=0 || n<=0 || !row_start || !column_index || !value
+         || !input || !result)
+      return QUERN_INPUT_ERROR;
+   int i, j;
+   double x;
+   for(i=0; i<m; ++i){
+      x=0;
+      for(j=row_start[i]; j<row_start[i+1]; ++j)
+         x+=value[j]*input[column_index[j]];
+      result[i]=x;
+   }
+   return QUERN_OK;
+}
+int QUERN_multiply_transpose(int m,
+                             int n,
+                             const int* row_start,
+                             const int* column_index,
+                             const double* value,
+                             const double* input,
+                             double* result)
+{
+   if(m<=0 || n<=0 || !row_start || !column_index || !value
+         || !input || !result)
+      return QUERN_INPUT_ERROR;
+   int i, j;
+   double x;
+   std::memset(result, 0, n*sizeof(double));
+   for(i=0; i<m; ++i){
+      x=input[i];
+      for(j=row_start[i]; j<row_start[i+1]; ++j)
+         result[column_index[j]]+=value[j]*x;
+   }
+   return QUERN_OK;
+}
--- a/libraries/quern/src/quern_factorization.cpp
+++ b/libraries/quern/src/quern_factorization.cpp
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "quern.h"
+#include "quern_list.h"
+struct SparseEntry
+{
+   int index;
+   double value;
+   SparseEntry(void) {}
+   SparseEntry(int index_, double value_) : index(index_), value(value_) {}
+};
+typedef Quern::List<SparseEntry> SparseVector;
+static bool copy_row(int nnz,
+                     const int* index,
+                     const double* value,
+                     SparseVector& x)
+{
+   for(int i=nnz-1; i>=0; --i) if(value[i]){
+      if(!x.push_front(SparseEntry(index[i], value[i]))) return false;
+   }
+   return true;
+}
+// compute c>=0 and s, with c^2+s^2=1, so that s*a+c*b=0
+static void givens(double a,
+                   double b,
+                   double& c,
+                   double& s)
+{
+   if(b==0){
+      c=1;
+      s=0;
+   }else{
+      if(std::fabs(b)>std::fabs(a)){
+         double tau=-a/b;
+         s=1/std::sqrt(1+tau*tau);
+         c=s*tau;
+         if(c<0){ c=-c; s=-s; }
+      }else{
+         double tau=-b/a;
+         c=1/std::sqrt(1+tau*tau);
+         s=c*tau;
+      }
+   }
+}
+// given c and s (c>=0, c^2+s^2=1) encode them in a single double
+static double encode(double c,
+                     double s)
+{
+   if(std::fabs(s)<c)
+      return s; // will be at most sqrt(0.5)=0.707... in magnitude
+   else
+      return copysign(1/c, s); // will be least sqrt(2)=1.414... in magnitude
+}
+// given output from encode, reconstruct c and s (c>=0, c^2+s^2=1)
+static void decode(double tau,
+                   double& c,
+                   double& s)
+{
+   if(std::fabs(tau)<1){
+      s=tau;
+      c=std::sqrt(1-s*s);
+   }else{
+      c=1/tau;
+      s=std::sqrt(1-c*c);
+      if(c<0){ c=-c; s=-s; }
+   }
+}
+static bool apply_givens(SparseVector& x,
+                         SparseVector& y,
+                         int diagonal,
+                         double& c,
+                         double& s)
+{
+   assert(!x.empty() && x.front().index==diagonal && x.front().value);
+   assert(!y.empty() && y.front().index==diagonal && y.front().value);
+   // find the rotation we need
+   double a=x.front().value, b=y.front().value;
+   givens(a, b, c, s);
+   assert(c && s);
+   // rotate the start of each list
+   x.front().value=c*a-s*b;
+   y.pop_front();
+   // then update the rest (x_new = c*x-s*y and y_new = s*x+c*y)
+   Quern::ListIterator<SparseEntry> p=x.begin(), q=y.begin();
+   ++p; // skip the first value we already took care of
+   while(p.still_going() && q.still_going()){
+      if(p->index==q->index){
+         double xnew=c*p->value-s*q->value,
+                ynew=s*p->value+c*q->value;
+         if(xnew){
+            p->value=xnew;
+            ++p;
+         }else x.erase(p);
+         if(ynew){
+            q->value=ynew;
+            ++q;
+         }else y.erase(q);
+      }else if(p->index<q->index){
+         int k=p->index;
+         double xnew=c*p->value,
+                ynew=s*p->value;
+         p->value=xnew;
+         ++p;
+         if(!y.insert(q, SparseEntry(k, ynew))) return false;
+         ++q;
+      }else{
+         int k=q->index;
+         double xnew=-s*q->value,
+                ynew=c*q->value;
+         if(!x.insert(p, SparseEntry(k, xnew))) return false;
+         ++p;
+         q->value=ynew;
+         ++q;
+      }
+   }
+   if(p.still_going()){
+      do{
+         int k=p->index;
+         double xnew=c*p->value,
+                ynew=s*p->value;
+         p->value=xnew;
+         ++p;
+         if(!y.insert(q, SparseEntry(k, ynew))) return false;
+         ++q;
+      }while(p.still_going());
+   }else if(q.still_going()){
+      do{
+         int k=q->index;
+         double xnew=-s*q->value,
+                ynew=c*q->value;
+         if(!x.insert(p, SparseEntry(k, xnew))) return false;
+         ++p;
+         q->value=ynew;
+         ++q;
+      }while(q.still_going());
+   }
+   return true;
+}
+int QUERN_compute_qr_without_q(int m,
+                               int n,
+                               const int* A_row_start,
+                               const int* A_column_index,
+                               const double* A_value,
+                               const int* row_order,
+                               int** ptr_R_row_start,
+                               int** ptr_R_column_index,
+                               double** ptr_R_value)
+{
+   if(m<=0 || n<=0 || m<n || !A_row_start || !A_column_index || !A_value
+      || !ptr_R_row_start || !ptr_R_column_index || !ptr_R_value)
+      return QUERN_INPUT_ERROR;
+   // set up lists for dynamically building R
+   Quern::Pool<SparseEntry> pool;
+   SparseVector* R=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!R) return QUERN_OUT_OF_MEMORY;
+   for(int i=0; i<m; ++i) R[i].init(&pool);
+   // do the Givens QR
+   SparseVector row;
+   row.init(&pool);
+   double c, s;
+   for(int a=0; a<m; ++a){
+      int i=(row_order ? row_order[a] : a);
+      if(!copy_row(A_row_start[i+1]-A_row_start[i],
+                   A_column_index+A_row_start[i],
+                   A_value+A_row_start[i],
+                   row)){
+         std::free(R);
+         return QUERN_OUT_OF_MEMORY;
+      }
+      while(!row.empty() && row.front().index<a && row.front().index<n){
+         int j=row.front().index;
+         if(R[j].empty() || R[j].front().index>j){ // swap?
+            R[j].swap(row);
+         }else{ // use Givens
+            if(!apply_givens(R[j], row, j, c, s)){
+               std::free(R);
+               return QUERN_OUT_OF_MEMORY;
+            }
+         }
+      }
+      if(a<n){
+         R[a].swap(row);
+         assert(R[a].empty() || R[a].front().index>=a);
+      }
+   }
+   // transfer R's lists to static CSR form
+   int* R_row_start=(int*)std::malloc((n+1)*sizeof(int));
+   if(!R_row_start){
+      std::free(R);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   R_row_start[0]=0;
+   for(int i=0; i<n; ++i)
+      R_row_start[i+1]=R_row_start[i]+R[i].size();
+   int Rnnz=R_row_start[n];
+   int* R_column_index=(int*)std::malloc(Rnnz*sizeof(int));
+   if(!R_column_index){
+      std::free(R);
+      std::free(R_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* R_value=(double*)std::malloc(Rnnz*sizeof(double));
+   if(!R_value){
+      std::free(R);
+      std::free(R_row_start);
+      std::free(R_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   int j=0;
+   for(int i=0; i<n; ++i){
+      Quern::ListIterator<SparseEntry> p;
+      for(p=R[i].begin(); p.still_going(); ++p){
+         R_column_index[j]=p->index;
+         R_value[j]=p->value;
+         ++j;
+      }
+   }
+   std::free(R);
+   *ptr_R_row_start=R_row_start;
+   *ptr_R_column_index=R_column_index;
+   *ptr_R_value=R_value;
+   return QUERN_OK;
+}
+int QUERN_compute_qr(int m,
+                     int n,
+                     const int* A_row_start,
+                     const int* A_column_index,
+                     const double* A_value,
+                     const int* row_order,
+                     int** ptr_Q_row_start,
+                     int** ptr_Q_column_index,
+                     double** ptr_Q_value,
+                     int** ptr_R_row_start,
+                     int** ptr_R_column_index,
+                     double** ptr_R_value)
+{
+   if(m<=0 || n<=0 || m<n || !A_row_start || !A_column_index || !A_value
+      || !ptr_Q_row_start || !ptr_Q_column_index || !ptr_Q_value
+      || !ptr_R_row_start || !ptr_R_column_index || !ptr_R_value)
+      return QUERN_INPUT_ERROR;
+   // set up lists for dynamically building Q and R
+   Quern::Pool<SparseEntry> pool;
+   SparseVector* Q=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!Q) return QUERN_OUT_OF_MEMORY;
+   SparseVector* R=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!R){
+      std::free(Q);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   for(int i=0; i<m; ++i){
+      Q[i].init(&pool);
+      R[i].init(&pool);
+   }
+   // do the Givens QR
+   SparseVector row;
+   row.init(&pool);
+   double c, s;
+   for(int a=0; a<m; ++a){
+      int i=(row_order ? row_order[a] : a);
+      if(!copy_row(A_row_start[i+1]-A_row_start[i],
+                   A_column_index+A_row_start[i],
+                   A_value+A_row_start[i],
+                   row)){
+         std::free(Q);
+         std::free(R);
+         return QUERN_OUT_OF_MEMORY;
+      }
+      Quern::ListIterator<SparseEntry> q=Q[a].begin();
+      while(!row.empty() && row.front().index<a && row.front().index<n){
+         int j=row.front().index;
+         if(R[j].empty() || R[j].front().index>j){ // swap?
+            R[j].swap(row);
+            Q[a].insert(q, SparseEntry(j, 1));
+            ++q;
+         }else{ // use Givens
+            if(!apply_givens(R[j], row, j, c, s)){
+               std::free(Q);
+               std::free(R);
+               return QUERN_OUT_OF_MEMORY;
+            }
+            Q[a].insert(q, SparseEntry(j, encode(c,s)));
+            ++q;
+         }
+      }
+      if(a<n){
+         R[a].swap(row);
+         assert(R[a].empty() || R[a].front().index>=a);
+      }
+   }
+   // transfer Q's lists to static CSR form
+   int* Q_row_start=(int*)std::malloc((m+1)*sizeof(int));
+   if(!Q_row_start){
+      std::free(Q);
+      std::free(R);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   Q_row_start[0]=0;
+   for(int i=0; i<m; ++i)
+      Q_row_start[i+1]=Q_row_start[i]+Q[i].size();
+   int Qnnz=Q_row_start[m];
+   int* Q_column_index=(int*)std::malloc(Qnnz*sizeof(int));
+   if(!Q_column_index){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* Q_value=(double*)std::malloc(Qnnz*sizeof(double));
+   if(!Q_value){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   int j=0;
+   for(int i=0; i<m; ++i){
+      Quern::ListIterator<SparseEntry> q;
+      for(q=Q[i].begin(); q.still_going(); ++q){
+         Q_column_index[j]=q->index;
+         Q_value[j]=q->value;
+         ++j;
+      }
+   }
+   std::free(Q);
+   // transfer R's lists to static CSR form
+   int* R_row_start=(int*)std::malloc((n+1)*sizeof(int));
+   if(!R_row_start){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   R_row_start[0]=0;
+   for(int i=0; i<n; ++i)
+      R_row_start[i+1]=R_row_start[i]+R[i].size();
+   int Rnnz=R_row_start[n];
+   int* R_column_index=(int*)std::malloc(Rnnz*sizeof(int));
+   if(!R_column_index){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      std::free(R_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* R_value=(double*)std::malloc(Rnnz*sizeof(double));
+   if(!R_value){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      std::free(R_row_start);
+      std::free(R_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   j=0;
+   for(int i=0; i<n; ++i){
+      Quern::ListIterator<SparseEntry> p;
+      for(p=R[i].begin(); p.still_going(); ++p){
+         R_column_index[j]=p->index;
+         R_value[j]=p->value;
+         ++j;
+      }
+   }
+   std::free(R);
+   *ptr_Q_row_start=Q_row_start;
+   *ptr_Q_column_index=Q_column_index;
+   *ptr_Q_value=Q_value;
+   *ptr_R_row_start=R_row_start;
+   *ptr_R_column_index=R_column_index;
+   *ptr_R_value=R_value;
+   return QUERN_OK;
+}
+static void determine_column_thresholds(int m,
+                                        int n,
+                                        const int* A_row_start,
+                                        const int* A_column_index,
+                                        const double* A_value,
+                                        double drop_tolerance,
+                                        double* drop)
+{
+   std::memset(drop, 0, n*sizeof(double));
+   for(int i=0; i<m; ++i){
+      for(int j=A_row_start[i]; j<A_row_start[i+1]; ++j)
+         drop[A_column_index[j]]+=A_value[j]*A_value[j];
+   }
+   for(int i=0; i<n; ++i){
+      drop[i]=std::sqrt(drop[i])*drop_tolerance;
+   }
+}
+static bool apply_givens(SparseVector& x,
+                         SparseVector& y,
+                         int diagonal,
+                         const double* drop,
+                         double& c,
+                         double& s)
+{
+   assert(!x.empty() && x.front().index==diagonal && x.front().value);
+   assert(!y.empty() && y.front().index==diagonal && y.front().value);
+   // find the rotation we need
+   double a=x.front().value, b=y.front().value;
+   givens(a, b, c, s);
+   assert(c && s);
+   // rotate the start of each list
+   x.front().value=c*a-s*b;
+   y.pop_front();
+   // then update the rest (x_new = c*x-s*y and y_new = s*x+c*y)
+   Quern::ListIterator<SparseEntry> p=x.begin(), q=y.begin();
+   ++p; // skip the first value we already took care of
+   while(p.still_going() && q.still_going()){
+      if(p->index==q->index){
+         int k=p->index;
+         double xnew=c*p->value-s*q->value,
+                ynew=s*p->value+c*q->value;
+         if(std::fabs(xnew)>drop[k]){
+            p->value=xnew;
+            ++p;
+         }else x.erase(p);
+         if(std::fabs(ynew)>drop[k]){
+            q->value=ynew;
+            ++q;
+         }else y.erase(q);
+      }else if(p->index<q->index){
+         int k=p->index;
+         double xnew=c*p->value,
+                ynew=s*p->value;
+         if(std::fabs(xnew)>drop[k]){
+            p->value=xnew;
+            ++p;
+         }else x.erase(p);
+         if(std::fabs(ynew)>drop[k]){
+            if(!y.insert(q, SparseEntry(k, ynew))) return false;
+            ++q;
+         }
+      }else{
+         int k=q->index;
+         double xnew=-s*q->value,
+                ynew=c*q->value;
+         if(std::fabs(xnew)>drop[k]){
+            if(!x.insert(p, SparseEntry(k, xnew))) return false;
+            ++p;
+         }
+         if(std::fabs(ynew)>drop[k]){
+            q->value=ynew;
+            ++q;
+         }else y.erase(q);
+      }
+   }
+   if(p.still_going()){
+      do{
+         int k=p->index;
+         double xnew=c*p->value,
+                ynew=s*p->value;
+         if(std::fabs(xnew)>drop[k]){
+            p->value=xnew;
+            ++p;
+         }else x.erase(p);
+         if(std::fabs(ynew)>drop[k]){
+            if(!y.insert(q, SparseEntry(k, ynew))) return false;
+            ++q;
+         }
+      }while(p.still_going());
+   }else if(q.still_going()){
+      do{
+         int k=q->index;
+         double xnew=-s*q->value,
+                ynew=c*q->value;
+         if(std::fabs(xnew)>drop[k]){
+            if(!x.insert(p, SparseEntry(k, xnew))) return false;
+            ++p;
+         }
+         if(std::fabs(ynew)>drop[k]){
+            q->value=ynew;
+            ++q;
+         }else y.erase(q);
+      }while(q.still_going());
+   }
+   return true;
+}
+int QUERN_compute_incomplete_qr_without_q(int m,
+                                          int n,
+                                          const int* A_row_start,
+                                          const int* A_column_index,
+                                          const double* A_value,
+                                          const int* row_order,
+                                          double drop_tolerance,
+                                          int** ptr_R_row_start,
+                                          int** ptr_R_column_index,
+                                          double** ptr_R_value)
+{
+   if(m<=0 || n<=0 || m<n || !A_row_start || !A_column_index || !A_value
+      || !ptr_R_row_start || !ptr_R_column_index || !ptr_R_value)
+      return QUERN_INPUT_ERROR;
+   // find per-column drop tolerance
+   double* drop=(double*)std::malloc(n*sizeof(double));
+   if(!drop) return QUERN_OUT_OF_MEMORY;
+   determine_column_thresholds(m, n, A_row_start, A_column_index, A_value,
+                               drop_tolerance, drop);
+   // set up lists for dynamically building R
+   Quern::Pool<SparseEntry> pool;
+   SparseVector* R=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!R){
+      std::free(drop);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   for(int i=0; i<m; ++i) R[i].init(&pool);
+   // do the Givens QR
+   SparseVector row;
+   row.init(&pool);
+   double c, s;
+   for(int a=0; a<m; ++a){
+      int i=(row_order ? row_order[a] : a);
+      if(!copy_row(A_row_start[i+1]-A_row_start[i],
+                   A_column_index+A_row_start[i],
+                   A_value+A_row_start[i],
+                   row)){
+         std::free(drop);
+         std::free(R);
+         return QUERN_OUT_OF_MEMORY;
+      }
+      while(!row.empty() && row.front().index<a && row.front().index<n){
+         int j=row.front().index;
+         if(R[j].empty() || R[j].front().index>j){ // swap?
+            R[j].swap(row);
+         }else{ // use Givens
+            if(!apply_givens(R[j], row, j, drop, c, s)){
+               std::free(drop);
+               std::free(R);
+               return QUERN_OUT_OF_MEMORY;
+            }
+         }
+      }
+      if(a<n){
+         R[a].swap(row);
+         assert(R[a].empty() || R[a].front().index>=a);
+      }
+   }
+   // don't need this any more
+   std::free(drop);
+   // transfer R's lists to static CSR form
+   int* R_row_start=(int*)std::malloc((n+1)*sizeof(int));
+   if(!R_row_start){
+      std::free(R);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   R_row_start[0]=0;
+   for(int i=0; i<n; ++i)
+      R_row_start[i+1]=R_row_start[i]+R[i].size();
+   int Rnnz=R_row_start[n];
+   int* R_column_index=(int*)std::malloc(Rnnz*sizeof(int));
+   if(!R_column_index){
+      std::free(R);
+      std::free(R_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* R_value=(double*)std::malloc(Rnnz*sizeof(double));
+   if(!R_value){
+      std::free(R);
+      std::free(R_row_start);
+      std::free(R_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   int j=0;
+   for(int i=0; i<n; ++i){
+      Quern::ListIterator<SparseEntry> p;
+      for(p=R[i].begin(); p.still_going(); ++p){
+         R_column_index[j]=p->index;
+         R_value[j]=p->value;
+         ++j;
+      }
+   }
+   std::free(R);
+   *ptr_R_row_start=R_row_start;
+   *ptr_R_column_index=R_column_index;
+   *ptr_R_value=R_value;
+   return QUERN_OK;
+}
+int QUERN_compute_incomplete_qr(int m,
+                                int n,
+                                const int* A_row_start,
+                                const int* A_column_index,
+                                const double* A_value,
+                                const int* row_order,
+                                double drop_tolerance,
+                                int** ptr_Q_row_start,
+                                int** ptr_Q_column_index,
+                                double** ptr_Q_value,
+                                int** ptr_R_row_start,
+                                int** ptr_R_column_index,
+                                double** ptr_R_value)
+{
+   if(m<=0 || n<=0 || m<n || !A_row_start || !A_column_index || !A_value
+      || !ptr_Q_row_start || !ptr_Q_column_index || !ptr_Q_value
+      || !ptr_R_row_start || !ptr_R_column_index || !ptr_R_value)
+      return QUERN_INPUT_ERROR;
+   // find per-column drop tolerance
+   double* drop=(double*)std::malloc(n*sizeof(double));
+   if(!drop) return QUERN_OUT_OF_MEMORY;
+   determine_column_thresholds(m, n, A_row_start, A_column_index, A_value,
+                               drop_tolerance, drop);
+   // set up lists for dynamically building Q and R
+   Quern::Pool<SparseEntry> pool;
+   SparseVector* Q=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!Q){
+      std::free(drop);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   SparseVector* R=(SparseVector*)std::malloc(m*sizeof(SparseVector));
+   if(!R){
+      std::free(drop);
+      std::free(Q);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   for(int i=0; i<m; ++i){
+      Q[i].init(&pool);
+      R[i].init(&pool);
+   }
+   // do the Givens QR
+   SparseVector row;
+   row.init(&pool);
+   double c, s;
+   for(int a=0; a<m; ++a){
+      int i=(row_order ? row_order[a] : a);
+      if(!copy_row(A_row_start[i+1]-A_row_start[i],
+                   A_column_index+A_row_start[i],
+                   A_value+A_row_start[i],
+                   row)){
+         std::free(drop);
+         std::free(Q);
+         std::free(R);
+         return QUERN_OUT_OF_MEMORY;
+      }
+      Quern::ListIterator<SparseEntry> q=Q[a].begin();
+      while(!row.empty() && row.front().index<a && row.front().index<n){
+         int j=row.front().index;
+         if(R[j].empty() || R[j].front().index>j){ // swap?
+            R[j].swap(row);
+            Q[a].insert(q, SparseEntry(j, 1));
+            ++q;
+         }else{ // use Givens
+            if(!apply_givens(R[j], row, j, drop, c, s)){
+               std::free(drop);
+               std::free(Q);
+               std::free(R);
+               return QUERN_OUT_OF_MEMORY;
+            }
+            Q[a].insert(q, SparseEntry(j, encode(c,s)));
+            ++q;
+         }
+      }
+      if(a<n){
+         R[a].swap(row);
+         assert(R[a].empty() || R[a].front().index>=a);
+      }
+   }
+   // don't need this any more
+   std::free(drop);
+   // transfer Q's lists to static CSR form
+   int* Q_row_start=(int*)std::malloc((m+1)*sizeof(int));
+   if(!Q_row_start){
+      std::free(Q);
+      std::free(R);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   Q_row_start[0]=0;
+   for(int i=0; i<m; ++i)
+      Q_row_start[i+1]=Q_row_start[i]+Q[i].size();
+   int Qnnz=Q_row_start[m];
+   int* Q_column_index=(int*)std::malloc(Qnnz*sizeof(int));
+   if(!Q_column_index){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* Q_value=(double*)std::malloc(Qnnz*sizeof(double));
+   if(!Q_value){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   int j=0;
+   for(int i=0; i<m; ++i){
+      Quern::ListIterator<SparseEntry> q;
+      for(q=Q[i].begin(); q.still_going(); ++q){
+         Q_column_index[j]=q->index;
+         Q_value[j]=q->value;
+         ++j;
+      }
+   }
+   std::free(Q);
+   // transfer R's lists to static CSR form
+   int* R_row_start=(int*)std::malloc((n+1)*sizeof(int));
+   if(!R_row_start){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   R_row_start[0]=0;
+   for(int i=0; i<n; ++i)
+      R_row_start[i+1]=R_row_start[i]+R[i].size();
+   int Rnnz=R_row_start[n];
+   int* R_column_index=(int*)std::malloc(Rnnz*sizeof(int));
+   if(!R_column_index){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      std::free(R_row_start);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   double* R_value=(double*)std::malloc(Rnnz*sizeof(double));
+   if(!R_value){
+      std::free(Q);
+      std::free(R);
+      std::free(Q_row_start);
+      std::free(Q_column_index);
+      std::free(Q_value);
+      std::free(R_row_start);
+      std::free(R_column_index);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   j=0;
+   for(int i=0; i<n; ++i){
+      Quern::ListIterator<SparseEntry> p;
+      for(p=R[i].begin(); p.still_going(); ++p){
+         R_column_index[j]=p->index;
+         R_value[j]=p->value;
+         ++j;
+      }
+   }
+   std::free(R);
+   *ptr_Q_row_start=Q_row_start;
+   *ptr_Q_column_index=Q_column_index;
+   *ptr_Q_value=Q_value;
+   *ptr_R_row_start=R_row_start;
+   *ptr_R_column_index=R_column_index;
+   *ptr_R_value=R_value;
+   return QUERN_OK;
+}
+void QUERN_free_result(int* row_start,
+                       int* column_index,
+                       double* value)
+{
+   std::free(row_start);
+   std::free(column_index);
+   std::free(value);
+}
+int QUERN_multiply_with_q(int m,
+                          const int* Q_row_start,
+                          const int* Q_column_index,
+                          const double* Q_value,
+                          double* x)
+{
+   if(m<=0 || !Q_row_start || !Q_column_index || !Q_value)
+      return QUERN_INPUT_ERROR;
+   int i, j, k;
+   double c, s;
+   for(i=m-1; i>=0; --i){
+      for(j=Q_row_start[i+1]-1; j>=Q_row_start[i]; --j){
+         k=Q_column_index[j];
+         if(Q_value[j]==1.){ // swap?
+            std::swap(x[i], x[k]);
+         }else{
+            decode(Q_value[j], c, s);
+            double newxk=c*x[k]+s*x[i];
+            x[i]=-s*x[k]+c*x[i];
+            x[k]=newxk;
+         }
+      }
+   }
+   return QUERN_OK;
+}
+int QUERN_multiply_with_q_transpose(int m,
+                                    const int* Q_row_start,
+                                    const int* Q_column_index,
+                                    const double* Q_value,
+                                    double* x)
+{
+   if(m<=0 || !Q_row_start || !Q_column_index || !Q_value)
+      return QUERN_INPUT_ERROR;
+   int i, j, k;
+   double c, s;
+   for(i=0; i<m; ++i){
+      for(j=Q_row_start[i]; j<Q_row_start[i+1]; ++j){
+         k=Q_column_index[j];
+         if(Q_value[j]==1){ // swap?
+            std::swap(x[i], x[k]);
+         }else{
+            decode(Q_value[j], c, s);
+            double newxk=c*x[k]-s*x[i];
+            x[i]=s*x[k]+c*x[i];
+            x[k]=newxk;
+         }
+      }
+   }
+   return QUERN_OK;
+}
+int QUERN_solve_with_r(int n,
+                       const int* R_row_start,
+                       const int* R_column_index,
+                       const double* R_value,
+                       const double* rhs,
+                       double* result)
+{
+   // check input
+   if(n<=0 || !R_row_start || !R_column_index || !R_value || !rhs || !result)
+      return QUERN_INPUT_ERROR;
+   // do the solve
+   double x, rii;
+   int i, j;
+   for(i=n-1; i>=0; --i){
+      x=rhs[i];
+      rii=0;
+      j=R_row_start[i];
+      if(j<R_row_start[i+1] && R_column_index[j]==i){
+         rii=R_value[j];
+         ++j;
+      }
+      if(rii){
+         for(; j<R_row_start[i+1]; ++j){
+            assert(R_column_index[j]>i && R_column_index[j]<n);
+            x-=R_value[j]*result[R_column_index[j]];
+         }
+         result[i]=x/rii;
+      }else
+         result[i]=0;
+   }
+   return QUERN_OK;
+}
+int QUERN_solve_with_r_transpose_in_place(int n,
+                                          const int* R_row_start,
+                                          const int* R_column_index,
+                                          const double* R_value,
+                                          double* x)
+{
+   // check input
+   if(n<=0 || !R_row_start || !R_column_index || !R_value || !x)
+      return QUERN_INPUT_ERROR;
+   // do the solve
+   double rii;
+   int i, j;
+   for(i=0; i<n; ++i){
+      rii=0;
+      j=R_row_start[i];
+      if(j<R_row_start[i+1] && R_column_index[j]==i){
+         rii=R_value[j];
+         ++j;
+      }
+      if(rii){
+         x[i]/=rii;
+         for(; j<R_row_start[i+1]; ++j){
+            assert(R_column_index[j]>i && R_column_index[j]<n);
+            x[R_column_index[j]]-=R_value[j]*x[i];
+         }
+      }else
+         x[i]=0;
+   }
+   return QUERN_OK;
+}
--- a/libraries/quern/src/quern_order.cpp
+++ b/libraries/quern/src/quern_order.cpp
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include "quern.h"
+int QUERN_get_rbfs_column_ordering(int m,
+                                   int n,
+                                   const int* A_row_start,
+                                   const int* A_column_index,
+                                   int* column_order)
+{
+   if(m<=0 || n<=0 || !A_row_start || !A_column_index || !column_order)
+      return QUERN_INPUT_ERROR;
+   // get some memory to work in
+   int* work=(int*)std::malloc((n+1+A_row_start[m]+n+n)*sizeof(int));
+   if(!work) return QUERN_OUT_OF_MEMORY;
+   // figure out number of entries in each column
+   int* column_start=work;
+   std::memset(column_start, 0, (n+1)*sizeof(int));
+   for(int i=0; i<m; ++i){
+      for(int j=A_row_start[i]; j<A_row_start[i+1]; ++j)
+         ++column_start[A_column_index[j]+1];
+   }
+   // cumulative sum to get column_start
+   for(int i=0; i<n; ++i)
+      column_start[i+1]+=column_start[i];
+   assert(column_start[n]==A_row_start[m]);
+   // list the columns now
+   int* row_index=column_start+(n+1);
+   int* column_pointer=row_index+column_start[n];
+   std::memcpy(column_pointer, column_start, n*sizeof(int));
+   for(int i=0; i<m; ++i){
+      for(int j=A_row_start[i]; j<A_row_start[i+1]; ++j){
+         int c=A_column_index[j];
+         row_index[column_pointer[c]++]=i;
+      }
+   }
+   // set up marker for BFS
+   char* column_marker=(char*)(column_pointer+n);
+   std::memset(column_marker, 0, n);
+   // and do as many BFS as we need to hit all connected components
+   int p=n;
+   for(int root=0; root<n; ++root) if(!column_marker[root]){
+      column_order[--p]=root;
+      column_marker[root]=1;
+      for(int i=p; i>=p; --i){
+         int j=column_order[i];
+         // add unmarked neighbour columns of j to ordering
+         for(int k=column_start[j]; k<column_start[j+1]; ++k){
+            int r=row_index[k];
+            for(int a=A_row_start[r]; a<A_row_start[r+1]; ++a){
+               int nbr=A_column_index[a];
+               if(!column_marker[nbr]){
+                  column_order[--p]=nbr;
+                  column_marker[nbr]=1;
+               }
+            }
+         }
+      }
+   }
+   assert(p==0);
+   std::free(work);
+   return QUERN_OK;
+}
+int QUERN_reorder_columns(int m,
+                          int n,
+                          const int* column_order,
+                          const int* A_row_start,
+                          int* A_column_index,
+                          double* A_value)
+{
+   if(m<=0 || n<=0 || !column_order
+         || !A_row_start || !A_column_index || !A_value)
+      return QUERN_INPUT_ERROR;
+   // Since we don't expect to have lots of empty columns, it would
+   // probably be superior to do this as two transposes --- but we'll
+   // worry about that optimization later.
+   // get some temporary storage for permuting and sorting
+   std::pair<int,double>* row=(std::pair<int,double>*)
+                                  std::malloc(n*sizeof(std::pair<int,double>));
+   if(!row) return QUERN_OUT_OF_MEMORY;
+   // and invert the permutation
+   int* inv=(int*)std::malloc(n*sizeof(int));
+   if(!inv){
+      std::free(row);
+      return QUERN_OUT_OF_MEMORY;
+   }
+   for(int i=0; i<n; ++i) inv[column_order[i]]=i;
+   // do it row by row
+   int k;
+   for(int i=0; i<m; ++i){
+      k=0;
+      for(int j=A_row_start[i]; j<A_row_start[i+1]; ++j)
+         row[k++]=std::make_pair(inv[A_column_index[j]], A_value[j]);
+      std::sort(row, row+k); // at some point should replace with radix-sort
+      k=0;
+      for(int j=A_row_start[i]; j<A_row_start[i+1]; ++j){
+         A_column_index[j]=row[k].first;
+         A_value[j]=row[k].second;
+         ++k;
+      } 
+   }
+   std::free(row);
+   std::free(inv);
+   return QUERN_OK;
+}
+int QUERN_get_profile_row_ordering(int m,
+                                   int n,
+                                   const int* A_row_start,
+                                   const int* A_column_index,
+                                   int* row_order)
+{
+   if(m<=0 || n<=0 || !A_row_start || !A_column_index || !row_order)
+      return QUERN_INPUT_ERROR;
+   // first count how many rows start at column i for each i=0, .., n-1
+   int* count=(int*)std::calloc(n+1, sizeof(int));
+   if(!count) return QUERN_OUT_OF_MEMORY;
+   for(int i=0; i<m; ++i)
+      if(A_row_start[i]<A_row_start[i+1])
+         ++count[A_column_index[A_row_start[i]]+1];
+   // then do a cumulative sum to find target locations of the rows
+   for(int j=2; j<n+1; ++j)
+      count[j]+=count[j-1];
+   // and now write out the row ordering with a second pass
+   for(int i=0; i<m; ++i){
+      if(A_row_start[i]<A_row_start[i+1])
+         row_order[count[A_column_index[A_row_start[i]]]++]=i;
+      else
+         row_order[count[n]++]=i;
+   }
+   std::free(count);
+   return QUERN_OK;
+}
+// Reorders a given input vector x into an output vector y of length m.
+int QUERN_reorder_vector(int m,
+                         const int* order,
+                         const double* x,
+                         double* y)
+{
+   if(m<=0 || !order || !x || !y) return QUERN_INPUT_ERROR;
+   for(int i=0; i<m; ++i)
+      y[i]=x[order[i]];
+   return QUERN_OK;
+}
+// Applies the inverse order to a given input vector x, saving in an output
+// vector y of length m.
+int QUERN_inverse_order_vector(int m,
+                               const int* order,
+                               const double* x,
+                               double* y)
+{
+   if(m<=0 || !order || !x || !y) return QUERN_INPUT_ERROR;
+   for(int i=0; i<m; ++i)
+      y[order[i]]=x[i];
+   return QUERN_OK;
+}
--- a/libraries/quern/src/quern_solver.cpp
+++ b/libraries/quern/src/quern_solver.cpp
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "quern.h"
+// TO-DO: use BLAS versions of these smaller kernels, if available
+double two_norm(int n, const double* x)
+{
+   double r=0;
+   for(int i=0; i<n; ++i) r+=x[i]*x[i];
+   return std::sqrt(r);
+}
+double two_norm_squared(int n, const double* x)
+{
+   double r=0;
+   for(int i=0; i<n; ++i) r+=x[i]*x[i];
+   return r;
+}
+// x=x+alpha*y
+void add_scaled(int n, double* x, double alpha, const double* y)
+{
+   for(int i=0; i<n; ++i) x[i]+=alpha*y[i];
+}
+// x=beta*x+y
+void scale_and_add(int n, double beta, double* x, const double* y)
+{
+   for(int i=0; i<n; ++i) x[i]=beta*x[i]+y[i];
+}
+int QUERN_solve_with_CGNR(int m,
+                          int n,
+                          const int* A_row_start,
+                          const int* A_column_index,
+                          const double* A_value,
+                          const double* rhs,
+                          const int* R_row_start,
+                          const int* R_column_index,
+                          const double* R_value,
+                          int max_iterations,
+                          double absolute_convergence_tolerance,
+                          double* x,
+                          int* return_solved,
+                          int* return_iterations,
+                          double* return_residual_norm)
+{
+   if(m<=0 || n<=0 || !A_row_start || !A_column_index || !A_value
+      || !rhs || !R_row_start || !R_column_index || !R_value || !x
+      || !return_solved || !return_iterations || !return_residual_norm)
+      return QUERN_INPUT_ERROR;
+   // default values
+   *return_solved=0;
+   *return_iterations=0;
+   *return_residual_norm=two_norm(n, rhs);
+   if(*return_residual_norm<=absolute_convergence_tolerance){
+      *return_solved=1;
+      return QUERN_OK;
+   }
+   // allocate some room to work in
+   double* working_vectors=(double*)std::malloc((3*n+m)*sizeof(double));
+   if(!working_vectors)
+      return QUERN_OUT_OF_MEMORY;
+   double* r=working_vectors;
+   double* s=r+n;
+   double* z=s+n;
+   double* u=z+n;
+   // set up CGNR
+   int check; 
+   std::memset(x, 0, n*sizeof(double));
+   std::memcpy(r, rhs, n*sizeof(double));
+   std::memcpy(u, rhs, n*sizeof(double));
+   check=QUERN_solve_with_r_transpose_in_place(n, R_row_start, R_column_index,
+                                               R_value, u);
+   if(check){ std::free(working_vectors); return check; }
+   check=QUERN_solve_with_r(n, R_row_start, R_column_index, R_value, u, z);
+   if(check){ std::free(working_vectors); return check; }
+   std::memcpy(s, z, n*sizeof(double));
+   double rho=two_norm_squared(n, u);
+   // the main loop
+   for(;;){
+      if(rho==0){ std::free(working_vectors); return QUERN_INPUT_ERROR; }
+      check=QUERN_multiply(m, n, A_row_start, A_column_index, A_value, s, u);
+      if(check){ std::free(working_vectors); return check; }
+      check=QUERN_multiply_transpose(m, n, A_row_start, A_column_index, A_value,
+                                     u, z);
+      if(check){ std::free(working_vectors); return check; }
+      double denom=two_norm_squared(m, u);
+      if(denom==0){ std::free(working_vectors); return QUERN_INPUT_ERROR; }
+      double alpha=rho/denom;
+      add_scaled(n, x, alpha, s);
+      add_scaled(n, r, -alpha, z);
+      ++*return_iterations;
+      *return_residual_norm=two_norm(n, r);
+      if(*return_residual_norm<=absolute_convergence_tolerance){
+         *return_solved=1;
+         break;
+      }
+      if(*return_iterations>max_iterations)
+         break;
+      std::memcpy(u, r, n*sizeof(double));
+      check=QUERN_solve_with_r_transpose_in_place(n, R_row_start,
+                                                  R_column_index, R_value, u);
+      if(check){ std::free(working_vectors); return check; }
+      check=QUERN_solve_with_r(n, R_row_start, R_column_index, R_value, u, z);
+      if(check){ std::free(working_vectors); return check; }
+      double rho_new=two_norm_squared(n, u);
+      double beta=rho_new/rho;
+      scale_and_add(n, beta, s, z);
+      rho=rho_new;
+   }
+   std::free(working_vectors);
+   return QUERN_OK;
+}
--- a/platforms/reference/src/ReferenceKernels.cpp
+++ b/platforms/reference/src/ReferenceKernels.cpp
@@ -58,21 +58,21 @@
 using namespace OpenMM;
 using namespace std;
-int** allocateIntArray(int length, int width) {
+static int** allocateIntArray(int length, int width) {
    int** array = new int*[length];
    for (int i = 0; i < length; ++i)
        array[i] = new int[width];
    return array;
 }
-RealOpenMM** allocateRealArray(int length, int width) {
+static RealOpenMM** allocateRealArray(int length, int width) {
    RealOpenMM** array = new RealOpenMM*[length];
    for (int i = 0; i < length; ++i)
        array[i] = new RealOpenMM[width];
    return array;
 }
-int** copyToArray(const vector<vector<int> > vec) {
+static int** copyToArray(const vector<vector<int> > vec) {
    if (vec.size() == 0)
        return new int*[0];
    int** array = allocateIntArray(vec.size(), vec[0].size());
@@ -82,7 +82,7 @@ int** copyToArray(const vector<vector<int> > vec) {
    return array;
 }
-RealOpenMM** copyToArray(const vector<vector<double> > vec) {
+static RealOpenMM** copyToArray(const vector<vector<double> > vec) {
    if (vec.size() == 0)
        return new RealOpenMM*[0];
    RealOpenMM** array = allocateRealArray(vec.size(), vec[0].size());
@@ -92,7 +92,7 @@ RealOpenMM** copyToArray(const vector<vector<double> > vec) {
    return array;
 }
-void disposeIntArray(int** array, int size) {
+static void disposeIntArray(int** array, int size) {
    if (array) {
        for (int i = 0; i < size; ++i)
            delete[] array[i];
@@ -100,7 +100,7 @@ void disposeIntArray(int** array, int size) {
    }
 }
-void disposeRealArray(RealOpenMM** array, int size) {
+static void disposeRealArray(RealOpenMM** array, int size) {
    if (array) {
        for (int i = 0; i < size; ++i)
            delete[] array[i];
@@ -108,6 +108,20 @@ void disposeRealArray(RealOpenMM** array, int size) {
    }
 }
+static void findAnglesForShake(const System& system, vector<ReferenceRigidShakeAlgorithm::AngleInfo>& angles) {
+    for (int i = 0; i < system.getNumForces(); i++) {
+        const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
+        if (force != NULL) {
+            for (int j = 0; j < force->getNumAngles(); j++) {
+                int atom1, atom2, atom3;
+                double angle, k;
+                force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
+                angles.push_back(ReferenceRigidShakeAlgorithm::AngleInfo(atom1, atom2, atom3, angle));
+            }
+        }
+    }
+}
 void ReferenceInitializeForcesKernel::initialize(const System& system) {
 }
@@ -609,7 +623,9 @@ void ReferenceIntegrateVerletStepKernel::execute(OpenMMContextImpl& context, con
            delete constraints;
        }
        dynamics = new ReferenceVerletDynamics(context.getSystem().getNumParticles(), static_cast<RealOpenMM>(stepSize) );
-        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, (RealOpenMM)integrator.getConstraintTolerance());
+        vector<ReferenceRigidShakeAlgorithm::AngleInfo> angles;
+        findAnglesForShake(context.getSystem(), angles);
+        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, angles, (RealOpenMM)integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevStepSize = stepSize;
    }
@@ -668,7 +684,9 @@ void ReferenceIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, c
 				static_cast<RealOpenMM>(stepSize), 
 				static_cast<RealOpenMM>(tau), 
 				static_cast<RealOpenMM>(temperature) );
-        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, (RealOpenMM)integrator.getConstraintTolerance());
+        vector<ReferenceRigidShakeAlgorithm::AngleInfo> angles;
+        findAnglesForShake(context.getSystem(), angles);
+        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, angles, (RealOpenMM)integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevTemp = temperature;
        prevFriction = friction;
@@ -728,7 +746,9 @@ void ReferenceIntegrateBrownianStepKernel::execute(OpenMMContextImpl& context, c
 				static_cast<RealOpenMM>(stepSize), 
 				static_cast<RealOpenMM>(friction), 
 				static_cast<RealOpenMM>(temperature) );
-        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, (RealOpenMM)integrator.getConstraintTolerance());
+        vector<ReferenceRigidShakeAlgorithm::AngleInfo> angles;
+        findAnglesForShake(context.getSystem(), angles);
+        constraints = new ReferenceRigidShakeAlgorithm(context.getSystem().getNumParticles(), numConstraints, constraintIndices, constraintDistances, masses, angles, (RealOpenMM)integrator.getConstraintTolerance());
        dynamics->setReferenceConstraintAlgorithm(constraints);
        prevTemp = temperature;
        prevFriction = friction;

--- a/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.cpp
@@ -31,10 +31,12 @@
 #include "ReferenceRigidShakeAlgorithm.h"
 #include "ReferenceDynamics.h"
 #include "jama_svd.h"
+#include "quern.h"
 #include "openmm/Vec3.h"
 #include <map>
 using std::map;
+using std::pair;
 using std::vector;
 using std::set;
 using OpenMM::Vec3;
@@ -58,6 +60,7 @@ ReferenceRigidShakeAlgorithm::ReferenceRigidShakeAlgorithm( int numberOfAtoms,
                                                  int** atomIndices,
                                                  RealOpenMM* distance,
                                                  RealOpenMM* masses,
+                                                  vector<AngleInfo>& angles,
                                                  RealOpenMM tolerance){
   // ---------------------------------------------------------------------------------------
@@ -92,156 +95,274 @@ ReferenceRigidShakeAlgorithm::ReferenceRigidShakeAlgorithm( int numberOfAtoms,
       _distanceTolerance          = SimTKOpenMMUtilities::allocateOneDRealOpenMMArray( numberOfConstraints, NULL, 1, zero, "distanceTolerance" );
       _reducedMasses              = SimTKOpenMMUtilities::allocateOneDRealOpenMMArray( numberOfConstraints, NULL, 1, zero, "reducedMasses" );
   }
+   if (numberOfConstraints > 0)
-   // Identify rigid clusters.
+   {
+       // Compute the constraint coupling matrix
-   vector<map<int, int> > atomConstraints(numberOfAtoms);
-   for (int i = 0; i < numberOfConstraints; i++) {
+       vector<vector<int> > atomAngles(numberOfAtoms);
-       atomConstraints[atomIndices[i][0]][atomIndices[i][1]] = i;
+       for (int i = 0; i < (int) angles.size(); i++)
-       atomConstraints[atomIndices[i][1]][atomIndices[i][0]] = i;
+           atomAngles[angles[i].atom2].push_back(i);
-   }
+       vector<vector<pair<int, double> > > matrix(numberOfConstraints);
-   for (int i = 0; i < numberOfAtoms; i++) {
+       for (int j = 0; j < numberOfConstraints; j++) {
-       if (atomConstraints[i].size() < 2)
+           for (int k = 0; k < numberOfConstraints; k++) {
-           continue;
+               if (j == k) {
+                   matrix[j].push_back(pair<int, double>(j, 1.0));
-       // Begin by looking for a triangle this atom is part of.
+                   continue;
-       set<int> atoms;
-       atoms.insert(i);
-       for (map<int, int>::const_iterator atom1 = atomConstraints[i].begin(); atom1 != atomConstraints[i].end() && atoms.size() == 1; ++atom1) {
-           for (map<int, int>::const_iterator atom2 = atomConstraints[atom1->first].begin(); atom2 != atomConstraints[atom1->first].end(); ++atom2) {
-               if (atomConstraints[i].count(atom2->first) != 0) {
-                   atoms.insert(atom1->first);
-                   atoms.insert(atom2->first);
-                   break;
-               }
-           }
-       }
-       if (atoms.size() == 1)
-           continue;
-       // We have three atoms that are part of a cluster, so look for other atoms we can add.
-       bool done = false;
-       while (!done) {
-           done = true;
-           for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
-               for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
-                   if (atoms.find(atom2->first) != atoms.end())
-                       continue; // This atom is already in the cluster.
-                   // See if this atom is linked to three other atoms in the cluster.
-                   int linkCount = 0;
-                   for (map<int, int>::const_iterator atom3 = atomConstraints[atom2->first].begin(); atom3 != atomConstraints[atom2->first].end(); ++atom3)
-                       if (atoms.find(atom3->first) != atoms.end())
-                           linkCount++;
-                   if (linkCount > 2) {
-                       atoms.insert(atom2->first);
-                       done = false;
-                   }
               }
-           }
-       }
-       // Record the cluster.
-       vector<int> constraints;
-       for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
-           for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
-               if (*atom1 < atom2->first && atoms.find(atom2->first) != atoms.end())
-                   constraints.push_back(atom2->second);
-           }
-       }
-       _rigidClusters.push_back(constraints);
-       for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
-           for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2)
-               atomConstraints[atom2->first].erase(*atom1);
-           atomConstraints[*atom1].clear();
-       }
-       // Compute the constraint coupling matrix for this cluster.
-       const vector<int>& cluster = _rigidClusters[i];
-       unsigned int size = cluster.size();
-       Array2D<double> matrix(size, size);
-       for (int j = 0; j < (int)size; j++) {
-           for (int k = 0; k < (int)size; k++) {
               double scale;
-               int atomj0 = _atomIndices[cluster[j]][0];
+               int atomj0 = _atomIndices[j][0];
-               int atomj1 = _atomIndices[cluster[j]][1];
+               int atomj1 = _atomIndices[j][1];
-               int atomk0 = _atomIndices[cluster[k]][0];
+               int atomk0 = _atomIndices[k][0];
-               int atomk1 = _atomIndices[cluster[k]][1];
+               int atomk1 = _atomIndices[k][1];
               RealOpenMM invMass0 = one/masses[atomj0];
               RealOpenMM invMass1 = one/masses[atomj1];
-               int atoma, atomb;
+               int atoma, atomb, atomc;
               if (atomj0 == atomk0) {
                   atoma = atomj1;
-                   atomb = atomk1;
+                   atomb = atomj0;
+                   atomc = atomk1;
                   scale = invMass0/(invMass0+invMass1);
               }
               else if (atomj1 == atomk1) {
                   atoma = atomj0;
-                   atomb = atomk0;
+                   atomb = atomj1;
+                   atomc = atomk0;
                   scale = invMass1/(invMass0+invMass1);
               }
               else if (atomj0 == atomk1) {
                   atoma = atomj1;
-                   atomb = atomk0;
+                   atomb = atomj0;
+                   atomc = atomk0;
                   scale = invMass0/(invMass0+invMass1);
               }
               else if (atomj1 == atomk0) {
                   atoma = atomj0;
-                   atomb = atomk1;
+                   atomb = atomj1;
+                   atomc = atomk1;
                   scale = invMass1/(invMass0+invMass1);
               }
-               else {
+               else
-                   matrix[j][k] = 0.0;
                   continue; // These constraints are not connected.
-               }
-               // Find the third constraint forming a triangle with these two.
+               // Look for a third constraint forming a triangle with these two.
-               for (int m = 0; m < size; m++) {
+               bool foundConstraint = false;
-                   int other = cluster[m];
+               for (int other = 0; other < numberOfConstraints; other++) {
-                   if ((_atomIndices[other][0] == atoma && _atomIndices[other][1] == atomb) || (_atomIndices[other][0] == atomb && _atomIndices[other][1] == atoma)) {
+                   if ((_atomIndices[other][0] == atoma && _atomIndices[other][1] == atomc) || (_atomIndices[other][0] == atomc && _atomIndices[other][1] == atoma)) {
-                       double d1 = _distance[cluster[j]];
+                       double d1 = _distance[j];
-                       double d2 = _distance[cluster[k]];
+                       double d2 = _distance[k];
                       double d3 = _distance[other];
-                       matrix[j][k] = scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2);
+                       matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
+                       foundConstraint = true;
                       break;
                   }
               }
+               if (!foundConstraint) {
+                   // We didn't find one, so look for an angle force field term.
+                   const vector<int>& angleCandidates = atomAngles[atomb];
+                   for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
+                       const AngleInfo& angle = angles[*iter];
+                       if ((angle.atom1 == atoma && angle.atom3 == atomc) || (angle.atom3 == atoma && angle.atom1 == atomc)) {
+                           matrix[j].push_back(pair<int, double>(k, scale*cos(angle.angle)));
+                           break;
+                       }
+                   }
+               }
           }
-           matrix[j][j] = 1.0;
       }
-       // Invert it using SVD.
+       // Invert it using QR.
-       Array2D<double> u, v;
+       vector<int> matrixRowStart;
-       Array1D<double> w;
+       vector<int> matrixColIndex;
-       SVD<double> svd(matrix);
+       vector<double> matrixValue;
-       svd.getU(u);
+       for (int i = 0; i < numberOfConstraints; i++) {
-       svd.getV(v);
+           matrixRowStart.push_back(matrixValue.size());
-       svd.getSingularValues(w);
+           for (int j = 0; j < (int) matrix[i].size(); j++) {
-       double singularValueCutoff = 0.01*w[0];
+               pair<int, double> element = matrix[i][j];
-       for (int j = 0; j < (int)size; j++)
+               matrixColIndex.push_back(element.first);
-           w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
+               matrixValue.push_back(element.second);
-       for (int j = 0; j < (int)size; j++) {
-           for (int k = 0; k < (int)size; k++) {
-               matrix[j][k] = 0.0;
-               for (int m = 0; m < (int)size; m++)
-                   matrix[j][k] += v[j][m]*w[m]*u[k][m];
           }
       }
+       matrixRowStart.push_back(matrixValue.size());
+       int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
+       double *qValue, *rValue;
+       QUERN_compute_qr(numberOfConstraints, numberOfConstraints, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
+               &qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
+       vector<double> rhs(numberOfConstraints);
+       _matrix.resize(numberOfConstraints);
+       for (int i = 0; i < numberOfConstraints; i++) {
+           // Extract column i of the inverse matrix.
+           for (int j = 0; j < numberOfConstraints; j++)
+               rhs[j] = (i == j ? 1.0 : 0.0);
+           QUERN_multiply_with_q_transpose(numberOfConstraints, qRowStart, qColIndex, qValue, &rhs[0]);
+           QUERN_solve_with_r(numberOfConstraints, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
+           for (int j = 0; j < numberOfConstraints; j++) {
+               double value = rhs[j]*_distance[i]/_distance[j];
+               if (abs(value) > 0.01)
+                   _matrix[j].push_back(pair<int, RealOpenMM>(i, (RealOpenMM) value));
+           }
+       }
+       QUERN_free_result(qRowStart, qColIndex, qValue);
+       QUERN_free_result(rRowStart, rColIndex, rValue);
+   }
-       // Record the inverted matrix.
-       _matrices.push_back(SimTKOpenMMUtilities::allocateTwoDRealOpenMMArray(constraints.size(), constraints.size(), NULL, false, 0.0, ""));
-       for (int j = 0; j < (int)size; j++)
-           for (int k = 0; k < (int)size; k++)
-               _matrices[i][j][k] = (RealOpenMM)matrix[j][k]*_distance[cluster[k]]/_distance[cluster[j]];
-   }
+//   // Identify rigid clusters.
+//
+//   vector<map<int, int> > atomConstraints(numberOfAtoms);
+//   for (int i = 0; i < numberOfConstraints; i++) {
+//       atomConstraints[atomIndices[i][0]][atomIndices[i][1]] = i;
+//       atomConstraints[atomIndices[i][1]][atomIndices[i][0]] = i;
+//   }
+//   for (int i = 0; i < numberOfAtoms; i++) {
+//       if (atomConstraints[i].size() < 2)
+//           continue;
+//
+//       // Begin by looking for a triangle this atom is part of.
+//
+//       set<int> atoms;
+//       atoms.insert(i);
+//       for (map<int, int>::const_iterator atom1 = atomConstraints[i].begin(); atom1 != atomConstraints[i].end() && atoms.size() == 1; ++atom1) {
+//           for (map<int, int>::const_iterator atom2 = atomConstraints[atom1->first].begin(); atom2 != atomConstraints[atom1->first].end(); ++atom2) {
+//               if (atomConstraints[i].count(atom2->first) != 0) {
+//                   atoms.insert(atom1->first);
+//                   atoms.insert(atom2->first);
+//                   break;
+//               }
+//           }
+//       }
+//       if (atoms.size() == 1)
+//           continue;
+//
+//       // We have three atoms that are part of a cluster, so look for other atoms we can add.
+//
+//       bool done = false;
+//       while (!done) {
+//           done = true;
+//           for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+//               for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
+//                   if (atoms.find(atom2->first) != atoms.end())
+//                       continue; // This atom is already in the cluster.
+//
+//                   // See if this atom is linked to three other atoms in the cluster.
+//
+//                   int linkCount = 0;
+//                   for (map<int, int>::const_iterator atom3 = atomConstraints[atom2->first].begin(); atom3 != atomConstraints[atom2->first].end(); ++atom3)
+//                       if (atoms.find(atom3->first) != atoms.end())
+//                           linkCount++;
+//                   if (linkCount > 2) {
+//                       atoms.insert(atom2->first);
+//                       done = false;
+//                   }
+//               }
+//           }
+//       }
+//
+//       // Record the cluster.
+//
+//       vector<int> constraints;
+//       for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+//           for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2) {
+//               if (*atom1 < atom2->first && atoms.find(atom2->first) != atoms.end())
+//                   constraints.push_back(atom2->second);
+//           }
+//       }
+//       _rigidClusters.push_back(constraints);
+//       for (set<int>::const_iterator atom1 = atoms.begin(); atom1 != atoms.end(); ++atom1) {
+//           for (map<int, int>::const_iterator atom2 = atomConstraints[*atom1].begin(); atom2 != atomConstraints[*atom1].end(); ++atom2)
+//               atomConstraints[atom2->first].erase(*atom1);
+//           atomConstraints[*atom1].clear();
+//       }
+//
+//       // Compute the constraint coupling matrix for this cluster.
+//
+//       const vector<int>& cluster = _rigidClusters[i];
+//       unsigned int size = cluster.size();
+//       Array2D<double> matrix(size, size);
+//       for (int j = 0; j < (int)size; j++) {
+//           for (int k = 0; k < (int)size; k++) {
+//               double scale;
+//               int atomj0 = _atomIndices[cluster[j]][0];
+//               int atomj1 = _atomIndices[cluster[j]][1];
+//               int atomk0 = _atomIndices[cluster[k]][0];
+//               int atomk1 = _atomIndices[cluster[k]][1];
+//               RealOpenMM invMass0 = one/masses[atomj0];
+//               RealOpenMM invMass1 = one/masses[atomj1];
+//               int atoma, atomb;
+//               if (atomj0 == atomk0) {
+//                   atoma = atomj1;
+//                   atomb = atomk1;
+//                   scale = invMass0/(invMass0+invMass1);
+//               }
+//               else if (atomj1 == atomk1) {
+//                   atoma = atomj0;
+//                   atomb = atomk0;
+//                   scale = invMass1/(invMass0+invMass1);
+//               }
+//               else if (atomj0 == atomk1) {
+//                   atoma = atomj1;
+//                   atomb = atomk0;
+//                   scale = invMass0/(invMass0+invMass1);
+//               }
+//               else if (atomj1 == atomk0) {
+//                   atoma = atomj0;
+//                   atomb = atomk1;
+//                   scale = invMass1/(invMass0+invMass1);
+//               }
+//               else {
+//                   matrix[j][k] = 0.0;
+//                   continue; // These constraints are not connected.
+//               }
+//
+//               // Find the third constraint forming a triangle with these two.
+//
+//               for (int m = 0; m < size; m++) {
+//                   int other = cluster[m];
+//                   if ((_atomIndices[other][0] == atoma && _atomIndices[other][1] == atomb) || (_atomIndices[other][0] == atomb && _atomIndices[other][1] == atoma)) {
+//                       double d1 = _distance[cluster[j]];
+//                       double d2 = _distance[cluster[k]];
+//                       double d3 = _distance[other];
+//                       matrix[j][k] = scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2);
+//                       break;
+//                   }
+//               }
+//           }
+//           matrix[j][j] = 1.0;
+//       }
+//
+//       // Invert it using SVD.
+//
+//       Array2D<double> u, v;
+//       Array1D<double> w;
+//       SVD<double> svd(matrix);
+//       svd.getU(u);
+//       svd.getV(v);
+//       svd.getSingularValues(w);
+//       double singularValueCutoff = 0.01*w[0];
+//       for (int j = 0; j < (int)size; j++)
+//           w[j] = (w[j] < singularValueCutoff ? 0.0 : 1.0/w[j]);
+//       for (int j = 0; j < (int)size; j++) {
+//           for (int k = 0; k < (int)size; k++) {
+//               matrix[j][k] = 0.0;
+//               for (int m = 0; m < (int)size; m++)
+//                   matrix[j][k] += v[j][m]*w[m]*u[k][m];
+//           }
+//       }
+//
+//       // Record the inverted matrix.
+//
+//       _matrices.push_back(SimTKOpenMMUtilities::allocateTwoDRealOpenMMArray(constraints.size(), constraints.size(), NULL, false, 0.0, ""));
+//       for (int j = 0; j < (int)size; j++)
+//           for (int k = 0; k < (int)size; k++)
+//               _matrices[i][j][k] = (RealOpenMM)matrix[j][k]*_distance[cluster[k]]/_distance[cluster[j]];
+//   }
 }
 /**---------------------------------------------------------------------------------------
@@ -265,8 +386,8 @@ ReferenceRigidShakeAlgorithm::~ReferenceRigidShakeAlgorithm( ){
       SimTKOpenMMUtilities::freeOneDRealOpenMMArray( _distanceTolerance, "distanceTolerance" );
       SimTKOpenMMUtilities::freeOneDRealOpenMMArray( _reducedMasses, "reducedMasses" );
    }
-    for (unsigned int i = 0; i < _matrices.size(); i++)
+//    for (unsigned int i = 0; i < _matrices.size(); i++)
-        SimTKOpenMMUtilities::freeTwoDRealOpenMMArray(_matrices[i], "");
+//        SimTKOpenMMUtilities::freeTwoDRealOpenMMArray(_matrices[i], "");
 }
 /**---------------------------------------------------------------------------------------
@@ -451,7 +572,7 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
   int iterations           = 0;
   int numberConverged      = 0;
   vector<RealOpenMM> constraintForce(_numberOfConstraints);
-   vector<RealOpenMM> tempForce(10);
+   vector<RealOpenMM> tempForce(_numberOfConstraints);
   while( !done && iterations++ < getMaximumNumberOfIterations() ){
      numberConverged  = 0;
      for( int ii = 0; ii < _numberOfConstraints; ii++ ){
@@ -482,22 +603,35 @@ int ReferenceRigidShakeAlgorithm::apply( int numberOfAtoms, RealOpenMM** atomCoo
      if( numberConverged == _numberOfConstraints ){
         done = true;
      }
-      for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
-          const vector<int>& cluster = _rigidClusters[i];
-          RealOpenMM** matrix = _matrices[i];
-          unsigned int size = cluster.size();
-          if (size > tempForce.size())
-              tempForce.resize(size);
-          for (unsigned int j = 0; j < size; j++) {
-              tempForce[j] = zero;
-              for (unsigned int k = 0; k < size; k++)
-                  tempForce[j] += matrix[j][k]*constraintForce[cluster[k]];
-          }
-          for (unsigned int j = 0; j < size; j++)
-              constraintForce[cluster[j]] = tempForce[j];
+      if (_matrix.size() > 0) {
+          for (int i = 0; i < _numberOfConstraints; i++) {
+              RealOpenMM sum = 0.0;
+              for (int j = 0; j < (int) _matrix[i].size(); j++) {
+                  pair<int, RealOpenMM> element = _matrix[i][j];
+                  sum += element.second*constraintForce[element.first];
+              }
+              tempForce[i] = sum;
+          }
+          constraintForce = tempForce;
      }
-RealOpenMM damping = one;//(RealOpenMM) (iterations%2 == 0 ? 0.5 : 1.0);
+//      for (unsigned int i = 0; i < _rigidClusters.size(); i++) {
+//          const vector<int>& cluster = _rigidClusters[i];
+//          RealOpenMM** matrix = _matrices[i];
+//          unsigned int size = cluster.size();
+//          if (size > tempForce.size())
+//              tempForce.resize(size);
+//          for (unsigned int j = 0; j < size; j++) {
+//              tempForce[j] = zero;
+//              for (unsigned int k = 0; k < size; k++)
+//                  tempForce[j] += matrix[j][k]*constraintForce[cluster[k]];
+//          }
+//          for (unsigned int j = 0; j < size; j++)
+//              constraintForce[cluster[j]] = tempForce[j];
+//
+//      }
+RealOpenMM damping = one;//(RealOpenMM) (iterations < 2 ? 0.5 : 1.0);
      for( int ii = 0; ii < _numberOfConstraints; ii++ ){
         int atomI   = _atomIndices[ii][0];

--- a/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.h
+++ b/platforms/reference/src/SimTKReference/ReferenceRigidShakeAlgorithm.h
@@ -26,6 +26,7 @@
 #define __ReferenceQShakeAlgorithm_H__
 #include "ReferenceConstraintAlgorithm.h"
+#include <utility>
 #include <vector>
 #include <set>
@@ -47,10 +48,12 @@ class ReferenceRigidShakeAlgorithm : public ReferenceConstraintAlgorithm {
      RealOpenMM* _distanceTolerance;
      RealOpenMM* _reducedMasses;
      bool _hasInitializedMasses;
-      std::vector<std::vector<int> > _rigidClusters;
+//      std::vector<std::vector<int> > _rigidClusters;
-      std::vector<RealOpenMM**> _matrices;
+//      std::vector<RealOpenMM**> _matrices;
+      std::vector<std::vector<std::pair<int, RealOpenMM> > > _matrix;
   public:
+      class AngleInfo;
      /**---------------------------------------------------------------------------------------
@@ -64,7 +67,7 @@ class ReferenceRigidShakeAlgorithm : public ReferenceConstraintAlgorithm {
         --------------------------------------------------------------------------------------- */
-      ReferenceRigidShakeAlgorithm( int numberOfAtoms, int numberOfConstraints, int** atomIndices, RealOpenMM* distance, RealOpenMM* masses, RealOpenMM tolerance );
+      ReferenceRigidShakeAlgorithm( int numberOfAtoms, int numberOfConstraints, int** atomIndices, RealOpenMM* distance, RealOpenMM* masses, std::vector<AngleInfo>& angles, RealOpenMM tolerance );
      /**---------------------------------------------------------------------------------------
@@ -171,6 +174,17 @@ class ReferenceRigidShakeAlgorithm : public ReferenceConstraintAlgorithm {
      int reportShake( int numberOfAtoms, RealOpenMM** atomCoordinates, std::stringstream& message );
 };
+class ReferenceRigidShakeAlgorithm::AngleInfo
+{
+public:
+    int atom1, atom2, atom3;
+    RealOpenMM angle;
+    AngleInfo(int atom1, int atom2, int atom3, RealOpenMM angle) :
+        atom1(atom1), atom2(atom2), atom3(atom3), angle(angle)
+    {
+    }
+};
 // ---------------------------------------------------------------------------------------
 #endif // __ReferenceQShakeAlgorithm_H__