Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Simplistic RPC implementation.
Exposes all functions of a Server object.
Uses pickle for serialization and the socket interface.
"""
import os, pdb, pickle, time, errno, sys, _thread, traceback, socket, threading, gc
import logging
LOG = logging.getLogger(__name__)
# default
PORT=12032
#########################################################################
# simple I/O functions
def inline_send_handle(f, conn):
st = os.fstat(f.fileno())
size = st.st_size
pickle.dump(size, conn)
conn.write(f.read(size))
def inline_send_string(s, conn):
size = len(s)
pickle.dump(size, conn)
conn.write(s)
class FileSock:
" wraps a socket so that it is usable by pickle/cPickle "
def __init__(self,sock):
self.sock = sock
self.nr=0
    def write(self, buf):
        # send in chunks of at most 512 KiB
        bs = 512 * 1024
        ns = 0
        while ns < len(buf):
            sent = self.sock.send(buf[ns:ns + bs])
            ns += sent
    def read(self, bs=512 * 1024):
        self.nr += 1
        # accumulate chunks until bs bytes were received or EOF
        b = []
        nb = 0
        while nb < bs:
            rb = self.sock.recv(bs - nb)
            if not rb:
                break
            b.append(rb)
            nb += len(rb)
        return b''.join(b)
    def readline(self):
        """reads one byte at a time; may be optimized..."""
        s = bytes()
        while True:
            c = self.read(1)
            s += c
            if len(c) == 0 or chr(c[0]) == '\n':
                return s
class ClientExit(Exception):
pass
class ServerException(Exception):
pass
class Server:
"""
server protocol. Methods from classes that subclass Server can be called
transparently from a client
"""
def __init__(self, s, logf=sys.stderr, log_prefix=''):
self.logf = logf
self.log_prefix = log_prefix
# connection
self.conn = s
self.fs = FileSock(s)
def log(self, s):
        self.logf.write("Server log %s: %s\n" % (self.log_prefix, s))
def one_function(self):
"""
Executes a single function with associated I/O.
Protocol:
- the arguments and results are serialized with the pickle protocol
- client sends : (fname,args)
fname = method name to call
args = tuple of arguments
        - server sends result: (st, ret)
              st = None, or a string describing the exception raised
                   during execution
              ret = the return value, or None if st is not None
"""
try:
(fname,args)=pickle.load(self.fs)
except EOFError:
raise ClientExit("read args")
self.log("executing method %s"%(fname))
st = None
ret = None
        try:
            f = getattr(self, fname)
        except AttributeError:
            st = AttributeError("unknown method " + fname)
            self.log("unknown method")
        else:
            try:
                ret = f(*args)
            except Exception as e:
                # due to a bug (in mod_python?), ServerException cannot be
                # unpickled, so send the traceback string and rebuild the
                # exception on the client side
                st = "".join(traceback.format_tb(sys.exc_info()[2])) + str(e)
                self.log("exception in method")
                traceback.print_exc(50, self.logf)
                self.logf.flush()
LOG.info("return")
try:
            pickle.dump((st, ret), self.fs, protocol=4)
except EOFError:
raise ClientExit("function return")
def exec_loop(self):
""" main execution loop. Loops and handles exit states"""
self.log("in exec_loop")
try:
while True:
self.one_function()
except ClientExit as e:
self.log("ClientExit %s"%e)
except socket.error as e:
self.log("socket error %s"%e)
traceback.print_exc(50,self.logf)
except EOFError:
self.log("EOF during communication")
traceback.print_exc(50,self.logf)
except BaseException:
# unexpected
traceback.print_exc(50,sys.stderr)
sys.exit(1)
LOG.info("exit sever")
def exec_loop_cleanup(self):
pass
###################################################################
# spying stuff
def get_ps_stats(self):
ret=''
f=os.popen("echo ============ `hostname` uptime:; uptime;"+
"echo ============ self:; "+
"ps -p %d -o pid,vsize,rss,%%cpu,nlwp,psr; "%os.getpid()+
"echo ============ run queue:;"+
"ps ar -o user,pid,%cpu,%mem,ni,nlwp,psr,vsz,rss,cputime,command")
for l in f:
ret+=l
return ret
class Client:
"""
Methods of the server object can be called transparently. Exceptions are
re-raised.
"""
def __init__(self, HOST, port=PORT, v6=False):
socktype = socket.AF_INET6 if v6 else socket.AF_INET
sock = socket.socket(socktype, socket.SOCK_STREAM)
LOG.info("connecting", HOST, port, socktype)
sock.connect((HOST, port))
self.sock = sock
self.fs = FileSock(sock)
def generic_fun(self, fname, args):
pickle.dump((fname, args), self.fs, protocol=4)
return self.get_result()
def get_result(self):
(st, ret) = pickle.load(self.fs)
        if st is not None:
raise ServerException(st)
else:
return ret
def __getattr__(self,name):
return lambda *x: self.generic_fun(name,x)
def run_server(new_handler, port=PORT, report_to_file=None, v6=False):
    HOST = ''  # symbolic name meaning all available interfaces
socktype = socket.AF_INET6 if v6 else socket.AF_INET
s = socket.socket(socktype, socket.SOCK_STREAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
LOG.info("bind %s:%d" % (HOST, port))
s.bind((HOST, port))
s.listen(5)
LOG.info("accepting connections")
    if report_to_file is not None:
        LOG.info('storing host+port in %s', report_to_file)
        with open(report_to_file, 'w') as f:
            f.write('%s:%d ' % (socket.gethostname(), port))
while True:
try:
conn, addr = s.accept()
        except socket.error as e:
            if e.errno == errno.EINTR:
                continue
raise
        LOG.info('Connected by %s', addr)
ibs = new_handler(conn)
tid = _thread.start_new_thread(ibs.exec_loop,())
LOG.info("tid",tid)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This is a set of function wrappers that override the default numpy versions.
Interoperability functions for pytorch and Faiss: Importing this will allow
pytorch Tensors (CPU or GPU) to be used as arguments to Faiss indexes and
other functions. Torch GPU tensors can only be used with Faiss GPU indexes.
If this is imported with a package that supports Faiss GPU, the necessary
stream synchronization with the current pytorch stream will be automatically
performed.
Numpy ndarrays can continue to be used in the Faiss python interface after
importing this file. All arguments must be uniformly either numpy ndarrays
or Torch tensors; no mixing is allowed.
"""
import faiss
import torch
import contextlib
import inspect
import sys
import numpy as np
def swig_ptr_from_UInt8Tensor(x):
""" gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
assert x.is_contiguous()
assert x.dtype == torch.uint8
return faiss.cast_integer_to_uint8_ptr(
x.storage().data_ptr() + x.storage_offset())
def swig_ptr_from_HalfTensor(x):
""" gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
assert x.is_contiguous()
assert x.dtype == torch.float16
    # no canonical half type in C/C++, so return a void*
    return faiss.cast_integer_to_void_ptr(
        x.storage().data_ptr() + x.storage_offset() * 2)  # 2 bytes per fp16
def swig_ptr_from_FloatTensor(x):
""" gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
assert x.is_contiguous()
assert x.dtype == torch.float32
return faiss.cast_integer_to_float_ptr(
x.storage().data_ptr() + x.storage_offset() * 4)
def swig_ptr_from_IntTensor(x):
""" gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
assert x.is_contiguous()
assert x.dtype == torch.int32, 'dtype=%s' % x.dtype
    return faiss.cast_integer_to_int_ptr(
        x.storage().data_ptr() + x.storage_offset() * 4)  # 4 bytes per int32
def swig_ptr_from_IndicesTensor(x):
""" gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """
assert x.is_contiguous()
assert x.dtype == torch.int64, 'dtype=%s' % x.dtype
return faiss.cast_integer_to_idx_t_ptr(
x.storage().data_ptr() + x.storage_offset() * 8)
@contextlib.contextmanager
def using_stream(res, pytorch_stream=None):
""" Creates a scoping object to make Faiss GPU use the same stream
as pytorch, based on torch.cuda.current_stream().
Or, a specific pytorch stream can be passed in as a second
argument, in which case we will use that stream.
"""
if pytorch_stream is None:
pytorch_stream = torch.cuda.current_stream()
# This is the cudaStream_t that we wish to use
cuda_stream_s = faiss.cast_integer_to_cudastream_t(pytorch_stream.cuda_stream)
# So we can revert GpuResources stream state upon exit
prior_dev = torch.cuda.current_device()
prior_stream = res.getDefaultStream(torch.cuda.current_device())
res.setDefaultStream(torch.cuda.current_device(), cuda_stream_s)
# Do the user work
try:
yield
finally:
res.setDefaultStream(prior_dev, prior_stream)
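# Example (a sketch, assuming a GPU build of Faiss and an existing GPU index):
#   res = faiss.StandardGpuResources()
#   with using_stream(res):
#       gpu_index.add(xb)   # Faiss work is ordered on torch.cuda.current_stream()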
def torch_replace_method(the_class, name, replacement,
ignore_missing=False, ignore_no_base=False):
try:
orig_method = getattr(the_class, name)
except AttributeError:
if ignore_missing:
return
raise
if orig_method.__name__ == 'torch_replacement_' + name:
# replacement was done in parent class
return
# We should already have the numpy replacement methods patched
assert ignore_no_base or (orig_method.__name__ == 'replacement_' + name)
setattr(the_class, name + '_numpy', orig_method)
setattr(the_class, name, replacement)
def handle_torch_Index(the_class):
def torch_replacement_add(self, x):
if type(x) is np.ndarray:
# forward to faiss __init__.py base method
return self.add_numpy(x)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.add_c(n, x_ptr)
else:
# CPU torch
self.add_c(n, x_ptr)
def torch_replacement_add_with_ids(self, x, ids):
if type(x) is np.ndarray:
# forward to faiss __init__.py base method
return self.add_with_ids_numpy(x, ids)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
assert type(ids) is torch.Tensor
assert ids.shape == (n, ), 'not same number of vectors as ids'
ids_ptr = swig_ptr_from_IndicesTensor(ids)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.add_with_ids_c(n, x_ptr, ids_ptr)
else:
# CPU torch
self.add_with_ids_c(n, x_ptr, ids_ptr)
def torch_replacement_assign(self, x, k, labels=None):
if type(x) is np.ndarray:
# forward to faiss __init__.py base method
return self.assign_numpy(x, k, labels)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if labels is None:
labels = torch.empty(n, k, device=x.device, dtype=torch.int64)
else:
assert type(labels) is torch.Tensor
assert labels.shape == (n, k)
L_ptr = swig_ptr_from_IndicesTensor(labels)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.assign_c(n, x_ptr, L_ptr, k)
else:
# CPU torch
self.assign_c(n, x_ptr, L_ptr, k)
return labels
def torch_replacement_train(self, x):
if type(x) is np.ndarray:
# forward to faiss __init__.py base method
return self.train_numpy(x)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.train_c(n, x_ptr)
else:
# CPU torch
self.train_c(n, x_ptr)
def torch_replacement_search(self, x, k, D=None, I=None):
if type(x) is np.ndarray:
# forward to faiss __init__.py base method
return self.search_numpy(x, k, D, I)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if D is None:
D = torch.empty(n, k, device=x.device, dtype=torch.float32)
else:
assert type(D) is torch.Tensor
assert D.shape == (n, k)
D_ptr = swig_ptr_from_FloatTensor(D)
if I is None:
I = torch.empty(n, k, device=x.device, dtype=torch.int64)
else:
assert type(I) is torch.Tensor
assert I.shape == (n, k)
I_ptr = swig_ptr_from_IndicesTensor(I)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.search_c(n, x_ptr, k, D_ptr, I_ptr)
else:
# CPU torch
self.search_c(n, x_ptr, k, D_ptr, I_ptr)
return D, I
def torch_replacement_search_and_reconstruct(self, x, k, D=None, I=None, R=None):
if type(x) is np.ndarray:
# Forward to faiss __init__.py base method
return self.search_and_reconstruct_numpy(x, k, D, I, R)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if D is None:
D = torch.empty(n, k, device=x.device, dtype=torch.float32)
else:
assert type(D) is torch.Tensor
assert D.shape == (n, k)
D_ptr = swig_ptr_from_FloatTensor(D)
if I is None:
I = torch.empty(n, k, device=x.device, dtype=torch.int64)
else:
assert type(I) is torch.Tensor
assert I.shape == (n, k)
I_ptr = swig_ptr_from_IndicesTensor(I)
if R is None:
R = torch.empty(n, k, d, device=x.device, dtype=torch.float32)
else:
assert type(R) is torch.Tensor
assert R.shape == (n, k, d)
R_ptr = swig_ptr_from_FloatTensor(R)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.search_and_reconstruct_c(n, x_ptr, k, D_ptr, I_ptr, R_ptr)
else:
# CPU torch
self.search_and_reconstruct_c(n, x_ptr, k, D_ptr, I_ptr, R_ptr)
return D, I, R
def torch_replacement_remove_ids(self, x):
# Not yet implemented
assert type(x) is not torch.Tensor, 'remove_ids not yet implemented for torch'
return self.remove_ids_numpy(x)
def torch_replacement_reconstruct(self, key, x=None):
        # No tensor inputs are required, but since this module has been
        # imported we assume that the default should be torch tensors. If we
        # are passed a numpy array, assume the user is overriding this default
if (x is not None) and (type(x) is np.ndarray):
# Forward to faiss __init__.py base method
return self.reconstruct_numpy(key, x)
# If the index is a CPU index, the default device is CPU, otherwise we
# produce a GPU tensor
device = torch.device('cpu')
if hasattr(self, 'getDevice'):
# same device as the index
device = torch.device('cuda', self.getDevice())
if x is None:
x = torch.empty(self.d, device=device, dtype=torch.float32)
else:
assert type(x) is torch.Tensor
assert x.shape == (self.d, )
x_ptr = swig_ptr_from_FloatTensor(x)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.reconstruct_c(key, x_ptr)
else:
# CPU torch
self.reconstruct_c(key, x_ptr)
return x
def torch_replacement_reconstruct_n(self, n0, ni, x=None):
        # No tensor inputs are required, but since this module has been
        # imported we assume that the default should be torch tensors. If we
        # are passed a numpy array, assume the user is overriding this default
if (x is not None) and (type(x) is np.ndarray):
# Forward to faiss __init__.py base method
return self.reconstruct_n_numpy(n0, ni, x)
# If the index is a CPU index, the default device is CPU, otherwise we
# produce a GPU tensor
device = torch.device('cpu')
if hasattr(self, 'getDevice'):
# same device as the index
device = torch.device('cuda', self.getDevice())
if x is None:
x = torch.empty(ni, self.d, device=device, dtype=torch.float32)
else:
assert type(x) is torch.Tensor
assert x.shape == (ni, self.d)
x_ptr = swig_ptr_from_FloatTensor(x)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.reconstruct_n_c(n0, ni, x_ptr)
else:
# CPU torch
self.reconstruct_n_c(n0, ni, x_ptr)
return x
def torch_replacement_update_vectors(self, keys, x):
if type(keys) is np.ndarray:
# Forward to faiss __init__.py base method
return self.update_vectors_numpy(keys, x)
assert type(keys) is torch.Tensor
(n, ) = keys.shape
keys_ptr = swig_ptr_from_IndicesTensor(keys)
assert type(x) is torch.Tensor
assert x.shape == (n, self.d)
x_ptr = swig_ptr_from_FloatTensor(x)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.update_vectors_c(n, keys_ptr, x_ptr)
else:
# CPU torch
self.update_vectors_c(n, keys_ptr, x_ptr)
# Until the GPU version is implemented, we do not support pre-allocated
# output buffers
def torch_replacement_range_search(self, x, thresh):
if type(x) is np.ndarray:
# Forward to faiss __init__.py base method
return self.range_search_numpy(x, thresh)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
assert not x.is_cuda, 'Range search using GPU tensor not yet implemented'
assert not hasattr(self, 'getDevice'), 'Range search on GPU index not yet implemented'
res = faiss.RangeSearchResult(n)
self.range_search_c(n, x_ptr, thresh, res)
# get pointers and copy them
# FIXME: no rev_swig_ptr equivalent for torch.Tensor, just convert
# np to torch
# NOTE: torch does not support np.uint64, just np.int64
lims = torch.from_numpy(faiss.rev_swig_ptr(res.lims, n + 1).copy().astype('int64'))
nd = int(lims[-1])
D = torch.from_numpy(faiss.rev_swig_ptr(res.distances, nd).copy())
I = torch.from_numpy(faiss.rev_swig_ptr(res.labels, nd).copy())
return lims, D, I
def torch_replacement_sa_encode(self, x, codes=None):
if type(x) is np.ndarray:
# Forward to faiss __init__.py base method
return self.sa_encode_numpy(x, codes)
assert type(x) is torch.Tensor
n, d = x.shape
assert d == self.d
x_ptr = swig_ptr_from_FloatTensor(x)
if codes is None:
codes = torch.empty(n, self.sa_code_size(), dtype=torch.uint8)
else:
assert codes.shape == (n, self.sa_code_size())
codes_ptr = swig_ptr_from_UInt8Tensor(codes)
if x.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.sa_encode_c(n, x_ptr, codes_ptr)
else:
# CPU torch
self.sa_encode_c(n, x_ptr, codes_ptr)
return codes
def torch_replacement_sa_decode(self, codes, x=None):
if type(codes) is np.ndarray:
# Forward to faiss __init__.py base method
return self.sa_decode_numpy(codes, x)
assert type(codes) is torch.Tensor
n, cs = codes.shape
assert cs == self.sa_code_size()
codes_ptr = swig_ptr_from_UInt8Tensor(codes)
if x is None:
x = torch.empty(n, self.d, dtype=torch.float32)
else:
assert type(x) is torch.Tensor
assert x.shape == (n, self.d)
x_ptr = swig_ptr_from_FloatTensor(x)
if codes.is_cuda:
assert hasattr(self, 'getDevice'), 'GPU tensor on CPU index not allowed'
# On the GPU, use proper stream ordering
with using_stream(self.getResources()):
self.sa_decode_c(n, codes_ptr, x_ptr)
else:
# CPU torch
self.sa_decode_c(n, codes_ptr, x_ptr)
return x
torch_replace_method(the_class, 'add', torch_replacement_add)
torch_replace_method(the_class, 'add_with_ids', torch_replacement_add_with_ids)
torch_replace_method(the_class, 'assign', torch_replacement_assign)
torch_replace_method(the_class, 'train', torch_replacement_train)
torch_replace_method(the_class, 'search', torch_replacement_search)
torch_replace_method(the_class, 'remove_ids', torch_replacement_remove_ids)
torch_replace_method(the_class, 'reconstruct', torch_replacement_reconstruct)
torch_replace_method(the_class, 'reconstruct_n', torch_replacement_reconstruct_n)
torch_replace_method(the_class, 'range_search', torch_replacement_range_search)
torch_replace_method(the_class, 'update_vectors', torch_replacement_update_vectors,
ignore_missing=True)
torch_replace_method(the_class, 'search_and_reconstruct',
torch_replacement_search_and_reconstruct, ignore_missing=True)
torch_replace_method(the_class, 'sa_encode', torch_replacement_sa_encode)
torch_replace_method(the_class, 'sa_decode', torch_replacement_sa_decode)
faiss_module = sys.modules['faiss']
# Re-patch anything that inherits from faiss.Index to add the torch bindings
for symbol in dir(faiss_module):
obj = getattr(faiss_module, symbol)
if inspect.isclass(obj):
the_class = obj
if issubclass(the_class, faiss.Index):
handle_torch_Index(the_class)
# allows torch tensor usage with bfKnn
def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2):
if type(xb) is np.ndarray:
# Forward to faiss __init__.py base method
return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric)
nb, d = xb.size()
if xb.is_contiguous():
xb_row_major = True
elif xb.t().is_contiguous():
xb = xb.t()
xb_row_major = False
else:
raise TypeError('matrix should be row or column-major')
if xb.dtype == torch.float32:
xb_type = faiss.DistanceDataType_F32
xb_ptr = swig_ptr_from_FloatTensor(xb)
elif xb.dtype == torch.float16:
xb_type = faiss.DistanceDataType_F16
xb_ptr = swig_ptr_from_HalfTensor(xb)
else:
raise TypeError('xb must be f32 or f16')
nq, d2 = xq.size()
assert d2 == d
if xq.is_contiguous():
xq_row_major = True
elif xq.t().is_contiguous():
xq = xq.t()
xq_row_major = False
else:
raise TypeError('matrix should be row or column-major')
if xq.dtype == torch.float32:
xq_type = faiss.DistanceDataType_F32
xq_ptr = swig_ptr_from_FloatTensor(xq)
elif xq.dtype == torch.float16:
xq_type = faiss.DistanceDataType_F16
xq_ptr = swig_ptr_from_HalfTensor(xq)
else:
raise TypeError('xq must be f32 or f16')
if D is None:
D = torch.empty(nq, k, device=xb.device, dtype=torch.float32)
else:
assert D.shape == (nq, k)
# interface takes void*, we need to check this
assert (D.dtype == torch.float32)
if I is None:
I = torch.empty(nq, k, device=xb.device, dtype=torch.int64)
else:
assert I.shape == (nq, k)
if I.dtype == torch.int64:
I_type = faiss.IndicesDataType_I64
I_ptr = swig_ptr_from_IndicesTensor(I)
    elif I.dtype == torch.int32:
I_type = faiss.IndicesDataType_I32
I_ptr = swig_ptr_from_IntTensor(I)
else:
raise TypeError('I must be i64 or i32')
D_ptr = swig_ptr_from_FloatTensor(D)
args = faiss.GpuDistanceParams()
args.metric = metric
args.k = k
args.dims = d
args.vectors = xb_ptr
args.vectorsRowMajor = xb_row_major
args.vectorType = xb_type
args.numVectors = nb
args.queries = xq_ptr
args.queriesRowMajor = xq_row_major
args.queryType = xq_type
args.numQueries = nq
args.outDistances = D_ptr
args.outIndices = I_ptr
args.outIndicesType = I_type
with using_stream(res):
faiss.bfKnn(res, args)
return D, I
torch_replace_method(faiss_module, 'knn_gpu', torch_replacement_knn_gpu, True, True)
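# Example (a sketch, assuming a GPU build of Faiss and an available device):
#   res = faiss.StandardGpuResources()
#   xb = torch.rand(10000, 64, device='cuda')
#   xq = torch.rand(100, 64, device='cuda')
#   D, I = faiss.knn_gpu(res, xq, xb, 10)   # brute-force 10-NN via bfKnn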
# allows torch tensor usage with bfKnn for all pairwise distances
def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.METRIC_L2):
if type(xb) is np.ndarray:
# Forward to faiss __init__.py base method
return faiss.pairwise_distance_gpu_numpy(res, xq, xb, D, metric)
nb, d = xb.size()
if xb.is_contiguous():
xb_row_major = True
elif xb.t().is_contiguous():
xb = xb.t()
xb_row_major = False
else:
raise TypeError('xb matrix should be row or column-major')
if xb.dtype == torch.float32:
xb_type = faiss.DistanceDataType_F32
xb_ptr = swig_ptr_from_FloatTensor(xb)
elif xb.dtype == torch.float16:
xb_type = faiss.DistanceDataType_F16
xb_ptr = swig_ptr_from_HalfTensor(xb)
else:
raise TypeError('xb must be float32 or float16')
nq, d2 = xq.size()
assert d2 == d
if xq.is_contiguous():
xq_row_major = True
elif xq.t().is_contiguous():
xq = xq.t()
xq_row_major = False
else:
raise TypeError('xq matrix should be row or column-major')
if xq.dtype == torch.float32:
xq_type = faiss.DistanceDataType_F32
xq_ptr = swig_ptr_from_FloatTensor(xq)
elif xq.dtype == torch.float16:
xq_type = faiss.DistanceDataType_F16
xq_ptr = swig_ptr_from_HalfTensor(xq)
else:
raise TypeError('xq must be float32 or float16')
if D is None:
D = torch.empty(nq, nb, device=xb.device, dtype=torch.float32)
else:
assert D.shape == (nq, nb)
# interface takes void*, we need to check this
assert (D.dtype == torch.float32)
D_ptr = swig_ptr_from_FloatTensor(D)
args = faiss.GpuDistanceParams()
args.metric = metric
    args.k = -1  # k = -1 requests all pairwise distances
args.dims = d
args.vectors = xb_ptr
args.vectorsRowMajor = xb_row_major
args.vectorType = xb_type
args.numVectors = nb
args.queries = xq_ptr
args.queriesRowMajor = xq_row_major
args.queryType = xq_type
args.numQueries = nq
args.outDistances = D_ptr
with using_stream(res):
faiss.bfKnn(res, args)
return D
torch_replace_method(faiss_module, 'pairwise_distance_gpu', torch_replacement_pairwise_distance_gpu, True, True)
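# Example (a sketch, same assumptions as the knn_gpu example above):
#   D = faiss.pairwise_distance_gpu(res, xq, xb)  # (nq, nb) distance matrix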
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
I/O functions in fvecs, bvecs, ivecs formats.
The formats are defined here: http://corpus-texmex.irisa.fr/
"""
import numpy as np
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def ivecs_mmap(fname):
a = np.memmap(fname, dtype='int32', mode='r')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
return ivecs_mmap(fname).view('float32')
def bvecs_mmap(fname):
x = np.memmap(fname, dtype='uint8', mode='r')
d = x[:4].view('int32')[0]
return x.reshape(-1, d + 4)[:, 4:]
def ivecs_write(fname, m):
n, d = m.shape
m1 = np.empty((n, d + 1), dtype='int32')
m1[:, 0] = d
m1[:, 1:] = m
m1.tofile(fname)
def fvecs_write(fname, m):
m = m.astype('float32')
ivecs_write(fname, m.view('int32'))
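# Round-trip sketch (illustrative; the file name below is hypothetical):
if __name__ == '__main__':
    m = np.random.rand(5, 8).astype('float32')
    fvecs_write('/tmp/example.fvecs', m)
    assert np.allclose(fvecs_read('/tmp/example.fvecs'), m)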
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
add_executable(demo_imi_flat EXCLUDE_FROM_ALL demo_imi_flat.cpp)
target_link_libraries(demo_imi_flat PRIVATE faiss)
add_executable(demo_imi_pq EXCLUDE_FROM_ALL demo_imi_pq.cpp)
target_link_libraries(demo_imi_pq PRIVATE faiss)
add_executable(demo_ivfpq_indexing EXCLUDE_FROM_ALL demo_ivfpq_indexing.cpp)
target_link_libraries(demo_ivfpq_indexing PRIVATE faiss)
add_executable(demo_nndescent EXCLUDE_FROM_ALL demo_nndescent.cpp)
target_link_libraries(demo_nndescent PRIVATE faiss)
add_executable(demo_sift1M EXCLUDE_FROM_ALL demo_sift1M.cpp)
target_link_libraries(demo_sift1M PRIVATE faiss)
add_executable(demo_weighted_kmeans EXCLUDE_FROM_ALL demo_weighted_kmeans.cpp)
target_link_libraries(demo_weighted_kmeans PRIVATE faiss)
Demos for a few Faiss functionalities
=====================================
demo_auto_tune.py
-----------------
Demonstrates the auto-tuning functionality of Faiss
demo_ondisk_ivf.py
------------------
Shows how to construct a Faiss index that stores the inverted file
data on disk, e.g. when it does not fit in RAM. The script works on a
small dataset (sift1M) for demonstration and proceeds in stages:
0: train on the dataset
1-4: build 4 indexes, each containing 1/4 of the dataset. This can be
done in parallel on several machines
5: merge the 4 indexes into one that is written directly to disk
(so the merged index need not fit in RAM)
6: load and test the index
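Each stage is run as a separate process, as in: python demo_ondisk_ivf.py 0
(stages 1-4 are independent of each other and can run in parallel).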
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import os
import time
import numpy as np
try:
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
graphical_output = True
except ImportError:
graphical_output = False
import faiss
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype="int32")
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
def plot_OperatingPoints(ops, nq, **kwargs):
ops = ops.optimal_pts
n = ops.size() * 2 - 1
pyplot.plot([ops.at( i // 2).perf for i in range(n)],
[ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)],
**kwargs)
#################################################################
# prepare common data for all indexes
#################################################################
t0 = time.time()
print("load data")
xt = fvecs_read("sift1M/sift_learn.fvecs")
xb = fvecs_read("sift1M/sift_base.fvecs")
xq = fvecs_read("sift1M/sift_query.fvecs")
d = xt.shape[1]
print("load GT")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
gt = gt.astype('int64')
k = gt.shape[1]
print("prepare criterion")
# criterion = 1-recall at 1
crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1)
crit.set_groundtruth(None, gt)
crit.nnn = k
# indexes that are useful when there is no limitation on memory usage
unlimited_mem_keys = [
"IMI2x10,Flat", "IMI2x11,Flat",
"IVF4096,Flat", "IVF16384,Flat",
"PCA64,IMI2x10,Flat"]
# memory limited to 16 bytes / vector
keys_mem_16 = [
'IMI2x10,PQ16', 'IVF4096,PQ16',
'IMI2x10,PQ8+8', 'OPQ16_64,IMI2x10,PQ16'
]
# limited to 32 bytes / vector
keys_mem_32 = [
'IMI2x10,PQ32', 'IVF4096,PQ32', 'IVF16384,PQ32',
'IMI2x10,PQ16+16',
'OPQ32,IVF4096,PQ32', 'IVF4096,PQ16+16', 'OPQ16,IMI2x10,PQ16+16'
]
# indexes that can run on the GPU
keys_gpu = [
"PCA64,IVF4096,Flat",
"PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat",
"IVF4096,PQ32"]
keys_to_test = unlimited_mem_keys
use_gpu = False
if use_gpu:
    # if this fails, it means that the GPU version was not compiled in
    assert faiss.StandardGpuResources, \
        "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
res = faiss.StandardGpuResources()
dev_no = 0
# remember results from other index types
op_per_key = []
# keep track of optimal operating points seen so far
op = faiss.OperatingPoints()
for index_key in keys_to_test:
print("============ key", index_key)
# make the index described by the key
index = faiss.index_factory(d, index_key)
if use_gpu:
# transfer to GPU (may be partial)
index = faiss.index_cpu_to_gpu(res, dev_no, index)
params = faiss.GpuParameterSpace()
else:
params = faiss.ParameterSpace()
params.initialize(index)
print("[%.3f s] train & add" % (time.time() - t0))
index.train(xt)
index.add(xb)
print("[%.3f s] explore op points" % (time.time() - t0))
# find operating points for this index
opi = params.explore(index, xq, crit)
print("[%.3f s] result operating points:" % (time.time() - t0))
opi.display()
# update best operating points so far
op.merge_with(opi, index_key + " ")
op_per_key.append((index_key, opi))
if graphical_output:
# graphical output (to tmp/ subdirectory)
fig = pyplot.figure(figsize=(12, 9))
pyplot.xlabel("1-recall at 1")
pyplot.ylabel("search time (ms/query, %d threads)" % faiss.omp_get_max_threads())
pyplot.gca().set_yscale('log')
pyplot.grid()
for i2, opi2 in op_per_key:
plot_OperatingPoints(opi2, crit.nq, label = i2, marker = 'o')
# plot_OperatingPoints(op, crit.nq, label = 'best', marker = 'o', color = 'r')
pyplot.legend(loc=2)
fig.savefig('tmp/demo_auto_tune.png')
print("[%.3f s] final result:" % (time.time() - t0))
op.display()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import numpy as np
import faiss
from faiss.contrib.client_server import run_index_server, ClientIndex
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
#################################################################
# Main program
#################################################################
stage = int(sys.argv[1])
tmpdir = '/tmp/'
if stage == 0:
# train the index
xt = fvecs_read("sift1M/sift_learn.fvecs")
index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
print("training index")
index.train(xt)
print("write " + tmpdir + "trained.index")
faiss.write_index(index, tmpdir + "trained.index")
if 1 <= stage <= 4:
# add 1/4 of the database to 4 independent indexes
bno = stage - 1
xb = fvecs_read("sift1M/sift_base.fvecs")
i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
index = faiss.read_index(tmpdir + "trained.index")
print("adding vectors %d:%d" % (i0, i1))
index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
print("write " + tmpdir + "block_%d.index" % bno)
faiss.write_index(index, tmpdir + "block_%d.index" % bno)
machine_ports = [
('localhost', 12010),
('localhost', 12011),
('localhost', 12012),
('localhost', 12013),
]
v6 = False
if 5 <= stage <= 8:
    # load an index slice and launch an index server for it
bno = stage - 5
fname = tmpdir + "block_%d.index" % bno
print("read " + fname)
index = faiss.read_index(fname)
port = machine_ports[bno][1]
run_index_server(index, port, v6=v6)
if stage == 9:
client_index = ClientIndex(machine_ports)
print('index size:', client_index.ntotal)
client_index.set_nprobe(16)
# load query vectors and ground-truth
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
D, I = client_index.search(xq, 5)
recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
print("recall@1: %.3f" % recall_at_1)
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexPQ.h>
#include <faiss/index_io.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 128;
// size of the database we plan to index
size_t nb = 1000 * 1000;
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
//---------------------------------------------------------------
// Define the core quantizer
// We choose a multiple inverted index for faster training with less data
// and because it usually offers best accuracy/speed trade-offs
//
    // We assume here that the lifespan of this coarse quantizer will cover
    // the lifespan of the inverted-file index IndexIVFFlat below
    // With dynamic allocation, one may give the responsibility to free the
// quantizer to the inverted-file index (with attribute do_delete_quantizer)
//
// Note: a regular clustering algorithm would be defined as:
// faiss::IndexFlatL2 coarse_quantizer (d);
//
    // Use nhash=2 subquantizers to define the product coarse quantizer
    // Number of bits: we will have 2^nbits_subq centroids per subquantizer,
    // meaning (2^nbits_subq)^nhash distinct inverted lists
size_t nhash = 2;
size_t nbits_subq = int(log2(nb + 1) / 2); // good choice in general
size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
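    // e.g. with nb = 1M: nbits_subq = int(log2(1e6 + 1) / 2) = 9, so
    // ncentroids = 2^(2*9) = 262144 inverted lists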
faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
nhash,
nbits_subq,
ncentroids,
nb);
// the coarse quantizer should not be dealloced before the index
faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
faiss::IndexIVFFlat index(&coarse_quantizer, d, ncentroids, metric);
index.quantizer_trains_alone = true;
    // define the number of probes. 2048 is for high-dim, overkill in practice
// Use 4-1024 depending on the trade-off speed accuracy that you want
index.nprobe = 2048;
std::mt19937 rng;
std::uniform_real_distribution<> distrib;
{ // training
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
for (size_t i = 0; i < nt * d; i++) {
trainvecs[i] = distrib(rng);
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
for (size_t i = 0; i < nb * d; i++) {
database[i] = distrib(rng);
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
index.add(nb, database.data());
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1244;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::Index::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
}
return 0;
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexPQ.h>
#include <faiss/index_io.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 64;
// size of the database we plan to index
size_t nb = 1000 * 1000;
size_t add_bs = 10000; // # size of the blocks to add
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
//---------------------------------------------------------------
// Define the core quantizer
// We choose a multiple inverted index for faster training with less data
// and because it usually offers best accuracy/speed trade-offs
//
    // We assume here that the lifespan of this coarse quantizer will cover
    // the lifespan of the inverted-file index IndexIVFPQ below
    // With dynamic allocation, one may give the responsibility to free the
// quantizer to the inverted-file index (with attribute do_delete_quantizer)
//
// Note: a regular clustering algorithm would be defined as:
// faiss::IndexFlatL2 coarse_quantizer (d);
//
    // Use nhash=2 subquantizers to define the product coarse quantizer
    // Number of bits: we will have 2^nbits_subq centroids per subquantizer,
    // meaning (2^nbits_subq)^nhash distinct inverted lists
//
// The parameter bytes_per_code is determined by the memory
// constraint, the dataset will use nb * (bytes_per_code + 8)
// bytes.
//
// The parameter nbits_subq is determined by the size of the dataset to
// index.
//
size_t nhash = 2;
size_t nbits_subq = 9;
size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
int bytes_per_code = 16;
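    // e.g. here: ncentroids = 2^(2*9) = 262144 inverted lists, and the
    // codes + ids take about nb * (16 + 8) = 24 MB for nb = 1M vectors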
faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)",
nhash,
nbits_subq,
ncentroids,
nb);
// the coarse quantizer should not be dealloced before the index
    // bytes_per_code = nb of bytes per PQ code (d must be a multiple of this)
    // 8 = nb of bits per sub-code (almost always 8)
faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
faiss::IndexIVFPQ index(
&coarse_quantizer, d, ncentroids, bytes_per_code, 8);
index.quantizer_trains_alone = true;
// define the number of probes. 2048 is for high-dim, overkill in practice
// Use 4-1024 depending on the trade-off speed accuracy that you want
index.nprobe = 2048;
std::mt19937 rng;
std::uniform_real_distribution<> distrib;
{ // training.
// The distribution of the training vectors should be the same
// as the database vectors. It could be a sub-sample of the
// database vectors, if sampling is not biased. Here we just
// randomly generate the vectors.
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
for (size_t i = 0; i < nt; i++) {
for (size_t j = 0; j < d; j++) {
trainvecs[i * d + j] = distrib(rng);
}
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
// the index can be re-loaded later with
// faiss::Index * idx = faiss::read_index("/tmp/trained_index.faissindex");
faiss::write_index(&index, "/tmp/trained_index.faissindex");
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
std::vector<faiss::Index::idx_t> ids(nb);
for (size_t i = 0; i < nb; i++) {
for (size_t j = 0; j < d; j++) {
database[i * d + j] = distrib(rng);
}
ids[i] = 8760000000L + i;
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
for (size_t begin = 0; begin < nb; begin += add_bs) {
size_t end = std::min(begin + add_bs, nb);
index.add_with_ids(
end - begin,
database.data() + d * begin,
ids.data() + begin);
}
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1244;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
// A few notes on the internal format of the index:
//
    // - the posting lists for PQ codes are index.codes, which is a
// std::vector < std::vector<uint8_t> >
// if n is the length of posting list #i, codes[i] has length
// bytes_per_code * n
//
// - the corresponding ids are stored in index.ids
//
// - given a vector float *x, finding which k centroids are
    //   closest to it (i.e. the nearest centroids) can be done with
//
// faiss::Index::idx_t *centroid_ids = new faiss::Index::idx_t[k];
// float *distances = new float[k];
    //   index.quantizer->search(1, x, k, distances, centroid_ids);
//
faiss::write_index(&index, "/tmp/populated_index.faissindex");
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::Index::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
}
return 0;
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <sys/time.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/index_io.h>
double elapsed() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// dimension of the vectors to index
int d = 128;
// size of the database we plan to index
size_t nb = 200 * 1000;
// make a set of nt training vectors in the unit cube
// (could be the database)
size_t nt = 100 * 1000;
// make the index object and train it
faiss::IndexFlatL2 coarse_quantizer(d);
// a reasonable number of centroids to index nb vectors
int ncentroids = int(4 * sqrt(nb));
// the coarse quantizer should not be dealloced before the index
// 4 = nb of bytes per code (d must be a multiple of this)
// 8 = nb of bits per sub-code (almost always 8)
faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, 4, 8);
std::mt19937 rng;
{ // training
printf("[%.3f s] Generating %ld vectors in %dD for training\n",
elapsed() - t0,
nt,
d);
std::vector<float> trainvecs(nt * d);
std::uniform_real_distribution<> distrib;
for (size_t i = 0; i < nt * d; i++) {
trainvecs[i] = distrib(rng);
}
printf("[%.3f s] Training the index\n", elapsed() - t0);
index.verbose = true;
index.train(nt, trainvecs.data());
}
{ // I/O demo
const char* outfilename = "/tmp/index_trained.faissindex";
printf("[%.3f s] storing the pre-trained index to %s\n",
elapsed() - t0,
outfilename);
write_index(&index, outfilename);
}
size_t nq;
std::vector<float> queries;
{ // populating the database
printf("[%.3f s] Building a dataset of %ld vectors to index\n",
elapsed() - t0,
nb);
std::vector<float> database(nb * d);
std::uniform_real_distribution<> distrib;
for (size_t i = 0; i < nb * d; i++) {
database[i] = distrib(rng);
}
printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
index.add(nb, database.data());
printf("[%.3f s] imbalance factor: %g\n",
elapsed() - t0,
index.invlists->imbalance_factor());
// remember a few elements from the database as queries
int i0 = 1234;
int i1 = 1243;
nq = i1 - i0;
queries.resize(nq * d);
for (int i = i0; i < i1; i++) {
for (int j = 0; j < d; j++) {
queries[(i - i0) * d + j] = database[i * d + j];
}
}
}
{ // searching the database
int k = 5;
printf("[%.3f s] Searching the %d nearest neighbors "
"of %ld vectors in the index\n",
elapsed() - t0,
k,
nq);
std::vector<faiss::Index::idx_t> nns(k * nq);
std::vector<float> dis(k * nq);
index.search(nq, queries.data(), k, dis.data(), nns.data());
printf("[%.3f s] Query results (vector ids, then distances):\n",
elapsed() - t0);
for (int i = 0; i < nq; i++) {
printf("query %2d: ", i);
for (int j = 0; j < k; j++) {
printf("%7ld ", nns[j + i * k]);
}
printf("\n dis: ");
for (int j = 0; j < k; j++) {
printf("%7g ", dis[j + i * k]);
}
printf("\n");
}
printf("note that the nearest neighbor is not at "
"distance 0 due to quantization errors\n");
}
return 0;
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <faiss/IndexFlat.h>
#include <faiss/IndexNNDescent.h>
using namespace std::chrono;
int main(void) {
// dimension of the vectors to index
int d = 64;
int K = 64;
// size of the database we plan to index
size_t nb = 10000;
std::mt19937 rng(12345);
// make the index object and train it
faiss::IndexNNDescentFlat index(d, K, faiss::METRIC_L2);
index.nndescent.S = 10;
index.nndescent.R = 32;
index.nndescent.L = K;
index.nndescent.iter = 10;
index.verbose = true;
    // generate ground-truth labels with a brute-force IndexFlat
faiss::IndexFlat bruteforce(d, faiss::METRIC_L2);
std::vector<float> database(nb * d);
for (size_t i = 0; i < nb * d; i++) {
database[i] = rng() % 1024;
}
{ // populating the database
index.add(nb, database.data());
bruteforce.add(nb, database.data());
}
size_t nq = 1000;
{ // searching the database
printf("Searching ...\n");
index.nndescent.search_L = 50;
std::vector<float> queries(nq * d);
for (size_t i = 0; i < nq * d; i++) {
queries[i] = rng() % 1024;
}
int k = 5;
std::vector<faiss::IndexNNDescent::idx_t> nns(k * nq);
std::vector<faiss::IndexFlat::idx_t> gt_nns(k * nq);
std::vector<float> dis(k * nq);
auto start = high_resolution_clock::now();
index.search(nq, queries.data(), k, dis.data(), nns.data());
auto end = high_resolution_clock::now();
// find exact kNNs by brute force search
bruteforce.search(nq, queries.data(), k, dis.data(), gt_nns.data());
int recalls = 0;
for (size_t i = 0; i < nq; ++i) {
for (int n = 0; n < k; n++) {
for (int m = 0; m < k; m++) {
if (nns[i * k + n] == gt_nns[i * k + m]) {
recalls += 1;
}
}
}
}
float recall = 1.0f * recalls / (k * nq);
auto t = duration_cast<microseconds>(end - start).count();
int qps = nq * 1.0f * 1000 * 1000 / t;
printf("Recall@%d: %f, QPS: %d\n", k, recall, qps);
}
}
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import sys
import numpy as np
import faiss
from faiss.contrib.ondisk import merge_ondisk
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
a = np.fromfile(fname, dtype='int32')
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view('float32')
#################################################################
# Main program
#################################################################
stage = int(sys.argv[1])
tmpdir = '/tmp/'
if stage == 0:
# train the index
xt = fvecs_read("sift1M/sift_learn.fvecs")
index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
print("training index")
index.train(xt)
print("write " + tmpdir + "trained.index")
faiss.write_index(index, tmpdir + "trained.index")
if 1 <= stage <= 4:
# add 1/4 of the database to 4 independent indexes
bno = stage - 1
xb = fvecs_read("sift1M/sift_base.fvecs")
i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
index = faiss.read_index(tmpdir + "trained.index")
print("adding vectors %d:%d" % (i0, i1))
index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
print("write " + tmpdir + "block_%d.index" % bno)
faiss.write_index(index, tmpdir + "block_%d.index" % bno)
if stage == 5:
print('loading trained index')
# construct the output index
index = faiss.read_index(tmpdir + "trained.index")
block_fnames = [
tmpdir + "block_%d.index" % bno
for bno in range(4)
]
merge_ondisk(index, block_fnames, tmpdir + "merged_index.ivfdata")
print("write " + tmpdir + "populated.index")
faiss.write_index(index, tmpdir + "populated.index")
if stage == 6:
# perform a search from disk
print("read " + tmpdir + "populated.index")
index = faiss.read_index(tmpdir + "populated.index")
index.nprobe = 16
# load query vectors and ground-truth
xq = fvecs_read("sift1M/sift_query.fvecs")
gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
D, I = index.search(xq, 5)
recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
print("recall@1: %.3f" % recall_at_1)
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/time.h>
#include <faiss/AutoTune.h>
#include <faiss/index_factory.h>
/**
* To run this demo, please download the ANN_SIFT1M dataset from
*
* http://corpus-texmex.irisa.fr/
*
 * and unzip it to the subdirectory sift1M.
**/
/*****************************************************
* I/O functions for fvecs and ivecs
*****************************************************/
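// Each fvecs entry is an int32 dimension d followed by d float32
// components, i.e. (d + 1) * 4 bytes per vector; ivecs uses the same
// layout with int32 components (see http://corpus-texmex.irisa.fr/)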
float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
FILE* f = fopen(fname, "r");
if (!f) {
fprintf(stderr, "could not open %s\n", fname);
perror("");
abort();
}
int d;
fread(&d, 1, sizeof(int), f);
assert((d > 0 && d < 1000000) || !"unreasonable dimension");
fseek(f, 0, SEEK_SET);
struct stat st;
fstat(fileno(f), &st);
size_t sz = st.st_size;
assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
size_t n = sz / ((d + 1) * 4);
*d_out = d;
*n_out = n;
float* x = new float[n * (d + 1)];
size_t nr = fread(x, sizeof(float), n * (d + 1), f);
assert(nr == n * (d + 1) || !"could not read whole file");
// shift array to remove row headers
for (size_t i = 0; i < n; i++)
memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
fclose(f);
return x;
}
// not very clean, but works as long as sizeof(int) == sizeof(float)
int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
return (int*)fvecs_read(fname, d_out, n_out);
}
double elapsed() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return tv.tv_sec + tv.tv_usec * 1e-6;
}
int main() {
double t0 = elapsed();
// this is typically the fastest one.
const char* index_key = "IVF4096,Flat";
// these ones have better memory usage
// const char *index_key = "Flat";
// const char *index_key = "PQ32";
// const char *index_key = "PCA80,Flat";
// const char *index_key = "IVF4096,PQ8+16";
// const char *index_key = "IVF4096,PQ32";
// const char *index_key = "IMI2x8,PQ32";
// const char *index_key = "IMI2x8,PQ8+16";
// const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
faiss::Index* index;
size_t d;
{
printf("[%.3f s] Loading train set\n", elapsed() - t0);
size_t nt;
float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
printf("[%.3f s] Preparing index \"%s\" d=%ld\n",
elapsed() - t0,
index_key,
d);
index = faiss::index_factory(d, index_key);
printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
index->train(nt, xt);
delete[] xt;
}
{
printf("[%.3f s] Loading database\n", elapsed() - t0);
size_t nb, d2;
float* xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
assert(d == d2 || !"dataset does not have same dimension as train set");
printf("[%.3f s] Indexing database, size %ld*%ld\n",
elapsed() - t0,
nb,
d);
index->add(nb, xb);
delete[] xb;
}
size_t nq;
float* xq;
{
printf("[%.3f s] Loading queries\n", elapsed() - t0);
size_t d2;
xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
assert(d == d2 || !"query does not have same dimension as train set");
}
size_t k; // nb of results per query in the GT
faiss::Index::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors
{
printf("[%.3f s] Loading ground truth for %ld queries\n",
elapsed() - t0,
nq);
// load ground-truth and convert int to long
size_t nq2;
int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
assert(nq2 == nq || !"incorrect nb of ground truth entries");
gt = new faiss::Index::idx_t[k * nq];
        for (size_t i = 0; i < k * nq; i++) {
gt[i] = gt_int[i];
}
delete[] gt_int;
}
// Result of the auto-tuning
std::string selected_params;
{ // run auto-tuning
printf("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
"criterion, with k=%ld nq=%ld\n",
elapsed() - t0,
k,
nq);
faiss::OneRecallAtRCriterion crit(nq, 1);
crit.set_groundtruth(k, nullptr, gt);
crit.nnn = k; // by default, the criterion will request only 1 NN
printf("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
faiss::ParameterSpace params;
params.initialize(index);
printf("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
elapsed() - t0,
params.parameter_ranges.size(),
params.n_combinations());
faiss::OperatingPoints ops;
params.explore(index, nq, xq, crit, &ops);
printf("[%.3f s] Found the following operating points: \n",
elapsed() - t0);
ops.display();
// keep the first parameter that obtains > 0.5 1-recall@1
for (int i = 0; i < ops.optimal_pts.size(); i++) {
if (ops.optimal_pts[i].perf > 0.5) {
selected_params = ops.optimal_pts[i].key;
break;
}
}
        assert(!selected_params.empty() ||
               !"could not find good enough op point");
}
{ // Use the found configuration to perform a search
faiss::ParameterSpace params;
printf("[%.3f s] Setting parameter configuration \"%s\" on index\n",
elapsed() - t0,
selected_params.c_str());
params.set_index_parameters(index, selected_params.c_str());
printf("[%.3f s] Perform a search on %ld queries\n",
elapsed() - t0,
nq);
// output buffers
faiss::Index::idx_t* I = new faiss::Index::idx_t[nq * k];
float* D = new float[nq * k];
index->search(nq, xq, k, D, I);
printf("[%.3f s] Compute recalls\n", elapsed() - t0);
// evaluate result by hand.
int n_1 = 0, n_10 = 0, n_100 = 0;
for (int i = 0; i < nq; i++) {
int gt_nn = gt[i * k];
for (int j = 0; j < k; j++) {
if (I[i * k + j] == gt_nn) {
if (j < 1)
n_1++;
if (j < 10)
n_10++;
if (j < 100)
n_100++;
}
}
}
printf("R@1 = %.4f\n", n_1 / float(nq));
printf("R@10 = %.4f\n", n_10 / float(nq));
printf("R@100 = %.4f\n", n_100 / float(nq));
delete[] I;
delete[] D;
}
delete[] xq;
delete[] gt;
delete index;
return 0;
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <cstdio>
#include <cstdlib>
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
namespace {
enum WeightedKMeansType {
WKMT_FlatL2,
WKMT_FlatIP,
WKMT_FlatIP_spherical,
WKMT_HNSW,
};
float weighted_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* input,
const float* weights,
float* centroids,
WeightedKMeansType index_num) {
using namespace faiss;
Clustering clus(d, k);
clus.verbose = true;
std::unique_ptr<Index> index;
switch (index_num) {
case WKMT_FlatL2:
index.reset(new IndexFlatL2(d));
break;
case WKMT_FlatIP:
index.reset(new IndexFlatIP(d));
break;
case WKMT_FlatIP_spherical:
index.reset(new IndexFlatIP(d));
clus.spherical = true;
break;
case WKMT_HNSW: {
IndexHNSWFlat* ihnsw = new IndexHNSWFlat(d, 32);
ihnsw->hnsw.efSearch = 128;
index.reset(ihnsw);
break;
}
}
clus.train(n, input, *index.get(), weights);
// on output the index contains the centroids.
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
return clus.iteration_stats.back().obj;
}
int d = 32;
float sigma = 0.1;
#define BIGTEST
#ifdef BIGTEST
// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
int nc = 200000;
int n_big = 4;
int n_small = 2;
#else
int nc = 5;
int n_big = 100;
int n_small = 10;
#endif
int n; // number of training points
void generate_trainset(
std::vector<float>& ccent,
std::vector<float>& x,
std::vector<float>& weights) {
// same sampling as test_build_blocks.py test_weighted
ccent.resize(d * 2 * nc);
faiss::float_randn(ccent.data(), d * 2 * nc, 123);
faiss::fvec_renorm_L2(d, 2 * nc, ccent.data());
n = nc * n_big + nc * n_small;
x.resize(d * n);
weights.resize(n);
faiss::float_randn(x.data(), x.size(), 1234);
float* xi = x.data();
float* w = weights.data();
for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
int np = ci < nc ? n_big : n_small; // nb of points around this centroid
for (int i = 0; i < np; i++) {
for (int j = 0; j < d; j++) {
xi[j] = xi[j] * sigma + ccent[ci * d + j];
}
*w++ = ci < nc ? 0.1 : 10;
xi += d;
}
}
}
} // namespace
int main(int argc, char** argv) {
std::vector<float> ccent;
std::vector<float> x;
std::vector<float> weights;
printf("generate training set\n");
generate_trainset(ccent, x, weights);
std::vector<float> centroids;
centroids.resize(nc * d);
int the_index_num = -1;
int the_with_weights = -1;
if (argc == 3) {
the_index_num = atoi(argv[1]);
the_with_weights = atoi(argv[2]);
}
for (int index_num = WKMT_FlatL2; index_num <= WKMT_HNSW; index_num++) {
if (the_index_num >= 0 && index_num != the_index_num) {
continue;
}
for (int with_weights = 0; with_weights <= 1; with_weights++) {
if (the_with_weights >= 0 && with_weights != the_with_weights) {
continue;
}
printf("=================== index_num=%d Run %s weights\n",
index_num,
with_weights ? "with" : "without");
weighted_kmeans_clustering(
d,
n,
nc,
x.data(),
with_weights ? weights.data() : nullptr,
centroids.data(),
(WeightedKMeansType)index_num);
{ // compute distance of the true cluster centers to the learned centroids
faiss::IndexFlatL2 cent_index(d);
cent_index.add(nc, centroids.data());
std::vector<float> dis(nc * 2);
std::vector<faiss::Index::idx_t> idx(nc * 2);
cent_index.search(
nc * 2, ccent.data(), 1, dis.data(), idx.data());
float dis1 = 0, dis2 = 0;
for (int i = 0; i < nc; i++) {
dis1 += dis[i];
}
printf("average distance of points from big clusters: %g\n",
dis1 / nc);
for (int i = 0; i < nc; i++) {
dis2 += dis[i + nc];
}
printf("average distance of points from small clusters: %g\n",
dis2 / nc);
}
}
}
return 0;
}
[flake8]
# Ignore flakes about ambiguous variable name `I`.
ignore = E741
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
/*
* implementation of Hyper-parameter auto-tuning
*/
#include <faiss/AutoTune.h>
#include <cinttypes>
#include <cmath>
#include <typeinfo>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexIVFPQR.h>
#include <faiss/IndexLSH.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/IndexRefine.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/MetaIndexes.h>
#include <faiss/VectorTransform.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryHNSW.h>
#include <faiss/IndexBinaryIVF.h>
namespace faiss {
AutoTuneCriterion::AutoTuneCriterion(idx_t nq, idx_t nnn)
: nq(nq), nnn(nnn), gt_nnn(0) {}
void AutoTuneCriterion::set_groundtruth(
int gt_nnn,
const float* gt_D_in,
const idx_t* gt_I_in) {
this->gt_nnn = gt_nnn;
if (gt_D_in) { // allow null for this, as it is often not used
gt_D.resize(nq * gt_nnn);
memcpy(gt_D.data(), gt_D_in, sizeof(gt_D[0]) * nq * gt_nnn);
}
gt_I.resize(nq * gt_nnn);
memcpy(gt_I.data(), gt_I_in, sizeof(gt_I[0]) * nq * gt_nnn);
}
OneRecallAtRCriterion::OneRecallAtRCriterion(idx_t nq, idx_t R)
: AutoTuneCriterion(nq, R), R(R) {}
double OneRecallAtRCriterion::evaluate(const float* /*D*/, const idx_t* I)
const {
FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= 1 && nnn >= R),
"ground truth not initialized");
idx_t n_ok = 0;
for (idx_t q = 0; q < nq; q++) {
idx_t gt_nn = gt_I[q * gt_nnn];
const idx_t* I_line = I + q * nnn;
for (int i = 0; i < R; i++) {
if (I_line[i] == gt_nn) {
n_ok++;
break;
}
}
}
return n_ok / double(nq);
}
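// Worked example (hypothetical values): with nq = 2, R = 2, gt_nnn = 1,
// ground truth gt_I = {7, 3} and search results I = {7, 9, 5, 3}
// (nnn = 2 per query), query 0 finds its true NN at rank 0 and query 1
// finds its NN at rank 1, so evaluate() returns 2 / 2 = 1.0.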
IntersectionCriterion::IntersectionCriterion(idx_t nq, idx_t R)
: AutoTuneCriterion(nq, R), R(R) {}
double IntersectionCriterion::evaluate(const float* /*D*/, const idx_t* I)
const {
FAISS_THROW_IF_NOT_MSG(
(gt_I.size() == gt_nnn * nq && gt_nnn >= R && nnn >= R),
"ground truth not initialized");
int64_t n_ok = 0;
#pragma omp parallel for reduction(+ : n_ok)
for (idx_t q = 0; q < nq; q++) {
n_ok += ranklist_intersection_size(
R, &gt_I[q * gt_nnn], R, I + q * nnn);
}
return n_ok / double(nq * R);
}
/***************************************************************
* OperatingPoints
***************************************************************/
OperatingPoints::OperatingPoints() {
clear();
}
void OperatingPoints::clear() {
all_pts.clear();
optimal_pts.clear();
/// default point: doing nothing gives 0 performance and takes 0 time
OperatingPoint op = {0, 0, "", -1};
optimal_pts.push_back(op);
}
/// add a performance measure
bool OperatingPoints::add(
double perf,
double t,
const std::string& key,
size_t cno) {
OperatingPoint op = {perf, t, key, int64_t(cno)};
all_pts.push_back(op);
if (perf == 0) {
return false; // no method for 0 accuracy is faster than doing nothing
}
std::vector<OperatingPoint>& a = optimal_pts;
if (perf > a.back().perf) {
// keep unconditionally
a.push_back(op);
} else if (perf == a.back().perf) {
if (t < a.back().t) {
a.back() = op;
} else {
return false;
}
} else {
int i;
// stricto sensu this should be a bisection
for (i = 0; i < a.size(); i++) {
if (a[i].perf >= perf)
break;
}
assert(i < a.size());
if (t < a[i].t) {
if (a[i].perf == perf) {
a[i] = op;
} else {
a.insert(a.begin() + i, op);
}
} else {
return false;
}
}
{ // remove non-optimal points from array
int i = a.size() - 1;
while (i > 0) {
if (a[i].t < a[i - 1].t)
a.erase(a.begin() + (i - 1));
i--;
}
}
return true;
}
int OperatingPoints::merge_with(
const OperatingPoints& other,
const std::string& prefix) {
int n_add = 0;
for (int i = 0; i < other.all_pts.size(); i++) {
const OperatingPoint& op = other.all_pts[i];
if (add(op.perf, op.t, prefix + op.key, op.cno))
n_add++;
}
return n_add;
}
/// get time required to obtain a given performance measure
double OperatingPoints::t_for_perf(double perf) const {
const std::vector<OperatingPoint>& a = optimal_pts;
if (perf > a.back().perf)
return 1e50;
int i0 = -1, i1 = a.size() - 1;
while (i0 + 1 < i1) {
int imed = (i0 + i1 + 1) / 2;
if (a[imed].perf < perf)
i0 = imed;
else
i1 = imed;
}
return a[i1].t;
}
void OperatingPoints::all_to_gnuplot(const char* fname) const {
FILE* f = fopen(fname, "w");
if (!f) {
fprintf(stderr, "cannot open %s", fname);
perror("");
abort();
}
for (int i = 0; i < all_pts.size(); i++) {
const OperatingPoint& op = all_pts[i];
fprintf(f, "%g %g %s\n", op.perf, op.t, op.key.c_str());
}
fclose(f);
}
void OperatingPoints::optimal_to_gnuplot(const char* fname) const {
FILE* f = fopen(fname, "w");
if (!f) {
fprintf(stderr, "cannot open %s", fname);
perror("");
abort();
}
double prev_perf = 0.0;
for (int i = 0; i < optimal_pts.size(); i++) {
const OperatingPoint& op = optimal_pts[i];
fprintf(f, "%g %g\n", prev_perf, op.t);
fprintf(f, "%g %g %s\n", op.perf, op.t, op.key.c_str());
prev_perf = op.perf;
}
fclose(f);
}
void OperatingPoints::display(bool only_optimal) const {
const std::vector<OperatingPoint>& pts =
only_optimal ? optimal_pts : all_pts;
printf("Tested %zd operating points, %zd ones are Pareto-optimal:\n",
all_pts.size(),
optimal_pts.size());
for (int i = 0; i < pts.size(); i++) {
const OperatingPoint& op = pts[i];
const char* star = "";
if (!only_optimal) {
for (int j = 0; j < optimal_pts.size(); j++) {
if (op.cno == optimal_pts[j].cno) {
star = "*";
break;
}
}
}
printf("cno=%" PRId64 " key=%s perf=%.4f t=%.3f %s\n",
op.cno,
op.key.c_str(),
op.perf,
op.t,
star);
}
}
/***************************************************************
* ParameterSpace
***************************************************************/
ParameterSpace::ParameterSpace()
: verbose(1),
n_experiments(500),
batchsize(1 << 30),
thread_over_batches(false),
min_test_duration(0) {}
/* not keeping this constructor as inheritors will call the parent
initialize()
*/
#if 0
ParameterSpace::ParameterSpace (Index *index):
verbose (1), n_experiments (500),
batchsize (1<<30), thread_over_batches (false)
{
initialize(index);
}
#endif
size_t ParameterSpace::n_combinations() const {
size_t n = 1;
for (int i = 0; i < parameter_ranges.size(); i++)
n *= parameter_ranges[i].values.size();
return n;
}
/// get string representation of the combination
std::string ParameterSpace::combination_name(size_t cno) const {
char buf[1000], *wp = buf;
*wp = 0;
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange& pr = parameter_ranges[i];
size_t j = cno % pr.values.size();
cno /= pr.values.size();
wp += snprintf(
wp,
buf + 1000 - wp,
"%s%s=%g",
i == 0 ? "" : ",",
pr.name.c_str(),
pr.values[j]);
}
return std::string(buf);
}
bool ParameterSpace::combination_ge(size_t c1, size_t c2) const {
for (int i = 0; i < parameter_ranges.size(); i++) {
int nval = parameter_ranges[i].values.size();
size_t j1 = c1 % nval;
size_t j2 = c2 % nval;
if (!(j1 >= j2))
return false;
c1 /= nval;
c2 /= nval;
}
return true;
}
#define DC(classname) \
const classname* ix = dynamic_cast<const classname*>(index)
static void init_pq_ParameterRange(
const ProductQuantizer& pq,
ParameterRange& pr) {
if (pq.code_size % 4 == 0) {
// Polysemous not supported for code sizes that are not a
// multiple of 4
for (int i = 2; i <= pq.code_size * 8 / 2; i += 2)
pr.values.push_back(i);
}
pr.values.push_back(pq.code_size * 8);
}
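// Worked example: for a PQ with code_size = 8 (64-bit codes), this yields
// ht values 2, 4, ..., 32 plus 64; setting ht = 64 (= code_size * 8) later
// disables polysemous filtering entirely in set_index_parameter.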
ParameterRange& ParameterSpace::add_range(const std::string& name) {
for (auto& pr : parameter_ranges) {
if (pr.name == name) {
return pr;
}
}
parameter_ranges.push_back(ParameterRange());
parameter_ranges.back().name = name;
return parameter_ranges.back();
}
/// initialize with reasonable parameters for this type of index
void ParameterSpace::initialize(const Index* index) {
if (DC(IndexPreTransform)) {
index = ix->index;
}
if (DC(IndexRefine)) {
ParameterRange& pr = add_range("k_factor_rf");
for (int i = 0; i <= 6; i++) {
pr.values.push_back(1 << i);
}
index = ix->base_index;
}
if (DC(IndexPreTransform)) {
index = ix->index;
}
if (DC(IndexIVF)) {
{
ParameterRange& pr = add_range("nprobe");
for (int i = 0; i < 13; i++) {
size_t nprobe = 1 << i;
if (nprobe >= ix->nlist)
break;
pr.values.push_back(nprobe);
}
}
ParameterSpace ivf_pspace;
ivf_pspace.initialize(ix->quantizer);
for (const ParameterRange& p : ivf_pspace.parameter_ranges) {
ParameterRange& pr = add_range("quantizer_" + p.name);
pr.values = p.values;
}
}
if (DC(IndexPQ)) {
ParameterRange& pr = add_range("ht");
init_pq_ParameterRange(ix->pq, pr);
}
if (DC(IndexIVFPQ)) {
ParameterRange& pr = add_range("ht");
init_pq_ParameterRange(ix->pq, pr);
}
if (DC(IndexIVF)) {
const MultiIndexQuantizer* miq =
dynamic_cast<const MultiIndexQuantizer*>(ix->quantizer);
if (miq) {
ParameterRange& pr_max_codes = add_range("max_codes");
for (int i = 8; i < 20; i++) {
pr_max_codes.values.push_back(1 << i);
}
pr_max_codes.values.push_back(
std::numeric_limits<double>::infinity());
}
}
if (DC(IndexIVFPQR)) {
ParameterRange& pr = add_range("k_factor");
for (int i = 0; i <= 6; i++) {
pr.values.push_back(1 << i);
}
}
if (dynamic_cast<const IndexHNSW*>(index)) {
ParameterRange& pr = add_range("efSearch");
for (int i = 2; i <= 9; i++) {
pr.values.push_back(1 << i);
}
}
}
#undef DC
// non-const version
#define DC(classname) classname* ix = dynamic_cast<classname*>(index)
/// set a combination of parameters on an index
void ParameterSpace::set_index_parameters(Index* index, size_t cno) const {
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange& pr = parameter_ranges[i];
size_t j = cno % pr.values.size();
cno /= pr.values.size();
double val = pr.values[j];
set_index_parameter(index, pr.name, val);
}
}
/// set a combination of parameters on an index
void ParameterSpace::set_index_parameters(
Index* index,
const char* description_in) const {
std::string description(description_in);
char* ptr;
for (char* tok = strtok_r(&description[0], " ,", &ptr); tok;
tok = strtok_r(nullptr, " ,", &ptr)) {
char name[100];
double val;
int ret = sscanf(tok, "%99[^=]=%lf", name, &val);
FAISS_THROW_IF_NOT_FMT(
ret == 2, "could not interpret parameters %s", tok);
set_index_parameter(index, name, val);
}
}
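// Example (sketch): the description string is a comma- or space-separated
// list of name=value pairs; given an index that accepts these parameters
// (e.g. an IVFPQ wrapped in a refinement index), one could write:
//
//   faiss::ParameterSpace ps;
//   ps.set_index_parameters(index, "nprobe=32,ht=256,k_factor_rf=4");
//
// Unknown parameter names make set_index_parameter throw.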
void ParameterSpace::set_index_parameter(
Index* index,
const std::string& name,
double val) const {
if (verbose > 1) {
printf(" set_index_parameter %s=%g\n", name.c_str(), val);
}
if (name == "verbose") {
index->verbose = int(val);
// and fall through to also enable it on sub-indexes
}
if (DC(IndexIDMap)) {
set_index_parameter(ix->index, name, val);
return;
}
if (DC(IndexPreTransform)) {
set_index_parameter(ix->index, name, val);
return;
}
if (DC(ThreadedIndex<Index>)) {
// call on all sub-indexes
auto fn = [this, name, val](int /* no */, Index* subIndex) {
set_index_parameter(subIndex, name, val);
};
ix->runOnIndex(fn);
return;
}
if (DC(IndexRefine)) {
if (name == "k_factor_rf") {
ix->k_factor = int(val);
return;
}
// otherwise it is for the sub-index
set_index_parameter(ix->base_index, name, val);
return;
}
if (name == "verbose") {
index->verbose = int(val);
return; // last verbose that we could find
}
if (name == "nprobe") {
if (DC(IndexIVF)) {
ix->nprobe = int(val);
return;
}
}
if (name == "ht") {
if (DC(IndexPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->search_type = IndexPQ::ST_PQ;
} else {
ix->search_type = IndexPQ::ST_polysemous;
ix->polysemous_ht = int(val);
}
return;
} else if (DC(IndexIVFPQ)) {
if (val >= ix->pq.code_size * 8) {
ix->polysemous_ht = 0;
} else {
ix->polysemous_ht = int(val);
}
return;
}
}
if (name == "k_factor") {
if (DC(IndexIVFPQR)) {
ix->k_factor = val;
return;
}
}
if (name == "max_codes") {
if (DC(IndexIVF)) {
ix->max_codes = std::isfinite(val) ? size_t(val) : 0;
return;
}
}
if (name == "efSearch") {
if (DC(IndexHNSW)) {
ix->hnsw.efSearch = int(val);
return;
}
if (DC(IndexIVF)) {
if (IndexHNSW* cq = dynamic_cast<IndexHNSW*>(ix->quantizer)) {
cq->hnsw.efSearch = int(val);
return;
}
}
}
if (name.find("quantizer_") == 0) {
if (DC(IndexIVF)) {
std::string sub_name = name.substr(strlen("quantizer_"));
set_index_parameter(ix->quantizer, sub_name, val);
return;
}
}
FAISS_THROW_FMT(
"ParameterSpace::set_index_parameter:"
"could not set parameter %s",
name.c_str());
}
void ParameterSpace::display() const {
printf("ParameterSpace, %zd parameters, %zd combinations:\n",
parameter_ranges.size(),
n_combinations());
for (int i = 0; i < parameter_ranges.size(); i++) {
const ParameterRange& pr = parameter_ranges[i];
printf(" %s: ", pr.name.c_str());
char sep = '[';
for (int j = 0; j < pr.values.size(); j++) {
printf("%c %g", sep, pr.values[j]);
sep = ',';
}
printf("]\n");
}
}
void ParameterSpace::update_bounds(
size_t cno,
const OperatingPoint& op,
double* upper_bound_perf,
double* lower_bound_t) const {
if (combination_ge(cno, op.cno)) {
if (op.t > *lower_bound_t)
*lower_bound_t = op.t;
}
if (combination_ge(op.cno, cno)) {
if (op.perf < *upper_bound_perf)
*upper_bound_perf = op.perf;
}
}
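// Worked example (hypothetical space with one parameter nprobe in
// {1, 2, 4}, so cno = 0, 1, 2): suppose the point op with cno = 1
// (nprobe=2) measured perf = 0.8 and t = 1.0 s. For candidate cno = 2,
// combination_ge(2, 1) holds, so *lower_bound_t rises to 1.0 s (a larger
// nprobe cannot be faster); for candidate cno = 0, combination_ge(1, 0)
// holds, so *upper_bound_perf drops to 0.8 (a smaller nprobe cannot be
// more accurate). explore() uses these bounds to skip experiments that
// cannot produce a new optimal point.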
void ParameterSpace::explore(
Index* index,
size_t nq,
const float* xq,
const AutoTuneCriterion& crit,
OperatingPoints* ops) const {
FAISS_THROW_IF_NOT_MSG(
nq == crit.nq, "criterion does not have the same nb of queries");
size_t n_comb = n_combinations();
if (n_experiments == 0) {
for (size_t cno = 0; cno < n_comb; cno++) {
set_index_parameters(index, cno);
std::vector<Index::idx_t> I(nq * crit.nnn);
std::vector<float> D(nq * crit.nnn);
double t0 = getmillisecs();
index->search(nq, xq, crit.nnn, D.data(), I.data());
double t_search = (getmillisecs() - t0) / 1e3;
double perf = crit.evaluate(D.data(), I.data());
bool keep = ops->add(perf, t_search, combination_name(cno), cno);
if (verbose)
printf(" %zd/%zd: %s perf=%.3f t=%.3f s %s\n",
cno,
n_comb,
combination_name(cno).c_str(),
perf,
t_search,
keep ? "*" : "");
}
return;
}
int n_exp = n_experiments;
if (n_exp > n_comb)
n_exp = n_comb;
FAISS_THROW_IF_NOT(n_comb == 1 || n_exp > 2);
std::vector<int> perm(n_comb);
// make sure the slowest and fastest experiment are run
perm[0] = 0;
if (n_comb > 1) {
perm[1] = n_comb - 1;
rand_perm(&perm[2], n_comb - 2, 1234);
for (int i = 2; i < perm.size(); i++)
perm[i]++;
}
for (size_t xp = 0; xp < n_exp; xp++) {
size_t cno = perm[xp];
if (verbose)
printf(" %zd/%d: cno=%zd %s ",
xp,
n_exp,
cno,
combination_name(cno).c_str());
{
double lower_bound_t = 0.0;
double upper_bound_perf = 1.0;
for (int i = 0; i < ops->all_pts.size(); i++) {
update_bounds(
cno,
ops->all_pts[i],
&upper_bound_perf,
&lower_bound_t);
}
double best_t = ops->t_for_perf(upper_bound_perf);
if (verbose)
printf("bounds [perf<=%.3f t>=%.3f] %s",
upper_bound_perf,
lower_bound_t,
best_t <= lower_bound_t ? "skip\n" : "");
if (best_t <= lower_bound_t)
continue;
}
set_index_parameters(index, cno);
std::vector<Index::idx_t> I(nq * crit.nnn);
std::vector<float> D(nq * crit.nnn);
double t0 = getmillisecs();
int nrun = 0;
double t_search;
do {
if (thread_over_batches) {
#pragma omp parallel for
for (Index::idx_t q0 = 0; q0 < nq; q0 += batchsize) {
size_t q1 = q0 + batchsize;
if (q1 > nq)
q1 = nq;
index->search(
q1 - q0,
xq + q0 * index->d,
crit.nnn,
D.data() + q0 * crit.nnn,
I.data() + q0 * crit.nnn);
}
} else {
for (size_t q0 = 0; q0 < nq; q0 += batchsize) {
size_t q1 = q0 + batchsize;
if (q1 > nq)
q1 = nq;
index->search(
q1 - q0,
xq + q0 * index->d,
crit.nnn,
D.data() + q0 * crit.nnn,
I.data() + q0 * crit.nnn);
}
}
nrun++;
t_search = (getmillisecs() - t0) / 1e3;
} while (t_search < min_test_duration);
t_search /= nrun;
double perf = crit.evaluate(D.data(), I.data());
bool keep = ops->add(perf, t_search, combination_name(cno), cno);
if (verbose)
printf(" perf %.3f t %.3f (%d %s) %s\n",
perf,
t_search,
nrun,
nrun >= 2 ? "runs" : "run",
keep ? "*" : "");
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_AUTO_TUNE_H
#define FAISS_AUTO_TUNE_H
#include <stdint.h>
#include <unordered_map>
#include <vector>
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
namespace faiss {
/**
* Evaluation criterion. Returns a performance measure in [0,1],
* higher is better.
*/
struct AutoTuneCriterion {
typedef Index::idx_t idx_t;
idx_t nq; ///< nb of queries this criterion is evaluated on
idx_t nnn; ///< nb of NNs that the query should request
idx_t gt_nnn; ///< nb of GT NNs required to evaluate criterion
std::vector<float> gt_D; ///< Ground-truth distances (size nq * gt_nnn)
std::vector<idx_t> gt_I; ///< Ground-truth indexes (size nq * gt_nnn)
AutoTuneCriterion(idx_t nq, idx_t nnn);
/** Initializes the gt_D and gt_I vectors. Must be called before evaluating
*
* @param gt_D_in size nq * gt_nnn
* @param gt_I_in size nq * gt_nnn
*/
void set_groundtruth(
int gt_nnn,
const float* gt_D_in,
const idx_t* gt_I_in);
/** Evaluate the criterion.
*
* @param D size nq * nnn
* @param I size nq * nnn
* @return the criterion, between 0 and 1. Larger is better.
*/
virtual double evaluate(const float* D, const idx_t* I) const = 0;
virtual ~AutoTuneCriterion() {}
};
struct OneRecallAtRCriterion : AutoTuneCriterion {
idx_t R;
OneRecallAtRCriterion(idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~OneRecallAtRCriterion() override {}
};
struct IntersectionCriterion : AutoTuneCriterion {
idx_t R;
IntersectionCriterion(idx_t nq, idx_t R);
double evaluate(const float* D, const idx_t* I) const override;
~IntersectionCriterion() override {}
};
/**
* Maintains a list of experimental results. Each operating point is a
 * (perf, t, key) triplet, where higher perf and lower t are
* better. The key field is an arbitrary identifier for the operating point.
*
* Includes primitives to extract the Pareto-optimal operating points in the
* (perf, t) space.
*/
struct OperatingPoint {
double perf; ///< performance measure (output of a Criterion)
double t; ///< corresponding execution time (ms)
std::string key; ///< key that identifies this op pt
int64_t cno; ///< integer identifier
};
struct OperatingPoints {
/// all operating points
std::vector<OperatingPoint> all_pts;
/// optimal operating points, sorted by perf
std::vector<OperatingPoint> optimal_pts;
// begins with a single operating point: t=0, perf=0
OperatingPoints();
/// add operating points from other to this, with a prefix to the keys
int merge_with(
const OperatingPoints& other,
const std::string& prefix = "");
void clear();
/// add a performance measure. Return whether it is an optimal point
bool add(double perf, double t, const std::string& key, size_t cno = 0);
/// get time required to obtain a given performance measure
double t_for_perf(double perf) const;
/// easy-to-read output
void display(bool only_optimal = true) const;
/// output to a format easy to digest by gnuplot
void all_to_gnuplot(const char* fname) const;
void optimal_to_gnuplot(const char* fname) const;
};
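/* Example (sketch, hypothetical measurements):
 *
 *   faiss::OperatingPoints ops;
 *   ops.add(0.50, 0.010, "nprobe=1", 0); // kept: best perf so far
 *   ops.add(0.80, 0.100, "nprobe=4", 2); // kept: higher perf, slower
 *   ops.add(0.70, 0.200, "nprobe=2", 1); // rejected: dominated by nprobe=4
 *   double t = ops.t_for_perf(0.75);     // == 0.100, via the nprobe=4 point
 */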
/// possible values of a parameter, sorted from least to most expensive/accurate
struct ParameterRange {
std::string name;
std::vector<double> values;
};
/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
*/
struct ParameterSpace {
/// all tunable parameters
std::vector<ParameterRange> parameter_ranges;
// exploration parameters
/// verbosity during exploration
int verbose;
/// nb of experiments during optimization (0 = try all combinations)
int n_experiments;
/// maximum number of queries to submit at a time.
size_t batchsize;
/// use multithreading over batches (useful to benchmark
/// independent single-searches)
bool thread_over_batches;
/// run tests several times until they reach at least this
/// duration (to avoid jittering in MT mode)
double min_test_duration;
ParameterSpace();
/// nb of combinations, = product of values sizes
size_t n_combinations() const;
/// returns whether combinations c1 >= c2 in the tuple sense
bool combination_ge(size_t c1, size_t c2) const;
/// get string representation of the combination
std::string combination_name(size_t cno) const;
/// print a description on stdout
void display() const;
/// add a new parameter (or return it if it exists)
ParameterRange& add_range(const std::string& name);
/// initialize with reasonable parameters for the index
virtual void initialize(const Index* index);
/// set a combination of parameters on an index
void set_index_parameters(Index* index, size_t cno) const;
/// set a combination of parameters described by a string
void set_index_parameters(Index* index, const char* param_string) const;
/// set one of the parameters, returns whether setting was successful
virtual void set_index_parameter(
Index* index,
const std::string& name,
double val) const;
/** find an upper bound on the performance and a lower bound on t
* for configuration cno given another operating point op */
void update_bounds(
size_t cno,
const OperatingPoint& op,
double* upper_bound_perf,
double* lower_bound_t) const;
/** explore operating points
* @param index index to run on
* @param xq query vectors (size nq * index.d)
* @param crit selection criterion
* @param ops resulting operating points
*/
void explore(
Index* index,
size_t nq,
const float* xq,
const AutoTuneCriterion& crit,
OperatingPoints* ops) const;
virtual ~ParameterSpace() {}
};
} // namespace faiss
#endif
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
set(FAISS_SRC
AutoTune.cpp
Clustering.cpp
IVFlib.cpp
Index.cpp
Index2Layer.cpp
IndexAdditiveQuantizer.cpp
IndexBinary.cpp
IndexBinaryFlat.cpp
IndexBinaryFromFloat.cpp
IndexBinaryHNSW.cpp
IndexBinaryHash.cpp
IndexBinaryIVF.cpp
IndexFlat.cpp
IndexFlatCodes.cpp
IndexHNSW.cpp
IndexIVF.cpp
IndexIVFAdditiveQuantizer.cpp
IndexIVFFlat.cpp
IndexIVFPQ.cpp
IndexIVFPQFastScan.cpp
IndexIVFPQR.cpp
IndexIVFSpectralHash.cpp
IndexLSH.cpp
IndexNNDescent.cpp
IndexLattice.cpp
IndexNSG.cpp
IndexPQ.cpp
IndexPQFastScan.cpp
IndexPreTransform.cpp
IndexRefine.cpp
IndexReplicas.cpp
IndexScalarQuantizer.cpp
IndexShards.cpp
MatrixStats.cpp
MetaIndexes.cpp
VectorTransform.cpp
clone_index.cpp
index_factory.cpp
impl/AuxIndexStructures.cpp
impl/FaissException.cpp
impl/HNSW.cpp
impl/NSG.cpp
impl/PolysemousTraining.cpp
impl/ProductQuantizer.cpp
impl/AdditiveQuantizer.cpp
impl/ResidualQuantizer.cpp
impl/LocalSearchQuantizer.cpp
impl/ScalarQuantizer.cpp
impl/index_read.cpp
impl/index_write.cpp
impl/io.cpp
impl/kmeans1d.cpp
impl/lattice_Zn.cpp
impl/pq4_fast_scan.cpp
impl/pq4_fast_scan_search_1.cpp
impl/pq4_fast_scan_search_qbs.cpp
impl/NNDescent.cpp
invlists/BlockInvertedLists.cpp
invlists/DirectMap.cpp
invlists/InvertedLists.cpp
invlists/InvertedListsIOHook.cpp
utils/Heap.cpp
utils/WorkerThread.cpp
utils/distances.cpp
utils/distances_simd.cpp
utils/extra_distances.cpp
utils/hamming.cpp
utils/partitioning.cpp
utils/quantize_lut.cpp
utils/random.cpp
utils/utils.cpp
)
set(FAISS_HEADERS
AutoTune.h
Clustering.h
IVFlib.h
Index.h
Index2Layer.h
IndexAdditiveQuantizer.h
IndexBinary.h
IndexBinaryFlat.h
IndexBinaryFromFloat.h
IndexBinaryHNSW.h
IndexBinaryHash.h
IndexBinaryIVF.h
IndexFlat.h
IndexFlatCodes.h
IndexHNSW.h
IndexIVF.h
IndexIVFAdditiveQuantizer.h
IndexIVFFlat.h
IndexIVFPQ.h
IndexIVFPQFastScan.h
IndexIVFPQR.h
IndexIVFSpectralHash.h
IndexLSH.h
IndexLattice.h
IndexNNDescent.h
IndexNSG.h
IndexPQ.h
IndexPQFastScan.h
IndexPreTransform.h
IndexRefine.h
IndexReplicas.h
IndexScalarQuantizer.h
IndexShards.h
MatrixStats.h
MetaIndexes.h
MetricType.h
VectorTransform.h
clone_index.h
index_factory.h
index_io.h
impl/AdditiveQuantizer.h
impl/AuxIndexStructures.h
impl/FaissAssert.h
impl/FaissException.h
impl/HNSW.h
impl/LocalSearchQuantizer.h
impl/NNDescent.h
impl/NSG.h
impl/PolysemousTraining.h
impl/ProductQuantizer-inl.h
impl/ProductQuantizer.h
impl/ResidualQuantizer.h
impl/ResultHandler.h
impl/ScalarQuantizer.h
impl/ThreadedIndex-inl.h
impl/ThreadedIndex.h
impl/io.h
impl/io_macros.h
impl/kmeans1d.h
impl/lattice_Zn.h
impl/platform_macros.h
impl/pq4_fast_scan.h
impl/simd_result_handlers.h
invlists/BlockInvertedLists.h
invlists/DirectMap.h
invlists/InvertedLists.h
invlists/InvertedListsIOHook.h
utils/AlignedTable.h
utils/Heap.h
utils/WorkerThread.h
utils/distances.h
utils/extra_distances-inl.h
utils/extra_distances.h
utils/hamming-inl.h
utils/hamming.h
utils/ordered_key_value.h
utils/partitioning.h
utils/quantize_lut.h
utils/random.h
utils/simdlib.h
utils/simdlib_avx2.h
utils/simdlib_emulated.h
utils/simdlib_neon.h
utils/utils.h
)
if(NOT WIN32)
list(APPEND FAISS_SRC invlists/OnDiskInvertedLists.cpp)
list(APPEND FAISS_HEADERS invlists/OnDiskInvertedLists.h)
endif()
# Export FAISS_HEADERS variable to parent scope.
set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE)
add_library(faiss ${FAISS_SRC})
add_library(faiss_avx2 ${FAISS_SRC})
if(NOT FAISS_OPT_LEVEL STREQUAL "avx2")
set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
if(NOT WIN32)
target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
else()
# MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT
# Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
# Ref. F16C (2nd paragraph): https://walbourn.github.io/directxmath-avx2/
# Ref. POPCNT: https://docs.microsoft.com/en-us/cpp/intrinsics/popcnt16-popcnt-popcnt64
target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
endif()
# Handle `#include <faiss/foo.h>`.
target_include_directories(faiss PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
# Handle `#include <faiss/foo.h>`.
target_include_directories(faiss_avx2 PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
set_target_properties(faiss PROPERTIES
POSITION_INDEPENDENT_CODE ON
WINDOWS_EXPORT_ALL_SYMBOLS ON
)
set_target_properties(faiss_avx2 PROPERTIES
POSITION_INDEPENDENT_CODE ON
WINDOWS_EXPORT_ALL_SYMBOLS ON
)
if(WIN32)
target_compile_definitions(faiss PRIVATE FAISS_MAIN_LIB)
target_compile_definitions(faiss_avx2 PRIVATE FAISS_MAIN_LIB)
endif()
target_compile_definitions(faiss PRIVATE FINTEGER=int)
target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int)
find_package(OpenMP REQUIRED)
target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX)
target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX)
find_package(MKL)
if(MKL_FOUND)
target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES})
target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES})
else()
find_package(BLAS REQUIRED)
target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES})
target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES})
find_package(LAPACK REQUIRED)
target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES})
target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES})
endif()
install(TARGETS faiss
EXPORT faiss-targets
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
if(FAISS_OPT_LEVEL STREQUAL "avx2")
install(TARGETS faiss_avx2
EXPORT faiss-targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
foreach(header ${FAISS_HEADERS})
get_filename_component(dir ${header} DIRECTORY)
install(FILES ${header}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${dir}
)
endforeach()
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
"${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake"
VERSION ${CMAKE_PROJECT_VERSION}
COMPATIBILITY AnyNewerVersion
)
configure_file(${PROJECT_SOURCE_DIR}/cmake/faiss-config.cmake.in
${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake
COPYONLY
)
install(FILES ${PROJECT_BINARY_DIR}/cmake/faiss-config.cmake
${PROJECT_BINARY_DIR}/cmake/faiss-config-version.cmake
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
)
install(EXPORT faiss-targets
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/faiss
)
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Clustering.h>
#include <faiss/VectorTransform.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <omp.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/kmeans1d.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
namespace faiss {
ClusteringParameters::ClusteringParameters()
: niter(25),
nredo(1),
verbose(false),
spherical(false),
int_centroids(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234),
decode_block_size(32768) {}
// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k
Clustering::Clustering(int d, int k) : d(d), k(k) {}
Clustering::Clustering(int d, int k, const ClusteringParameters& cp)
: ClusteringParameters(cp), d(d), k(k) {}
static double imbalance_factor(int n, int k, int64_t* assign) {
std::vector<int> hist(k, 0);
for (int i = 0; i < n; i++)
hist[assign[i]]++;
double tot = 0, uf = 0;
for (int i = 0; i < k; i++) {
tot += hist[i];
uf += hist[i] * (double)hist[i];
}
uf = uf * k / (tot * tot);
return uf;
}
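// Worked example: n = 4, k = 2. A balanced assignment hist = {2, 2} gives
// uf = (4 + 4) * 2 / 16 = 1.0 (the optimum); a skewed assignment
// hist = {3, 1} gives uf = (9 + 1) * 2 / 16 = 1.25.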
void Clustering::post_process_centroids() {
if (spherical) {
fvec_renorm_L2(d, k, centroids.data());
}
if (int_centroids) {
for (size_t i = 0; i < centroids.size(); i++)
centroids[i] = roundf(centroids[i]);
}
}
void Clustering::train(
idx_t nx,
const float* x_in,
Index& index,
const float* weights) {
train_encoded(
nx,
reinterpret_cast<const uint8_t*>(x_in),
nullptr,
index,
weights);
}
namespace {
using idx_t = Clustering::idx_t;
idx_t subsample_training_set(
const Clustering& clus,
idx_t nx,
const uint8_t* x,
size_t line_size,
const float* weights,
uint8_t** x_out,
float** weights_out) {
if (clus.verbose) {
printf("Sampling a subset of %zd / %" PRId64 " for training\n",
clus.k * clus.max_points_per_centroid,
nx);
}
std::vector<int> perm(nx);
rand_perm(perm.data(), nx, clus.seed);
nx = clus.k * clus.max_points_per_centroid;
uint8_t* x_new = new uint8_t[nx * line_size];
*x_out = x_new;
for (idx_t i = 0; i < nx; i++) {
memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size);
}
if (weights) {
float* weights_new = new float[nx];
for (idx_t i = 0; i < nx; i++) {
weights_new[i] = weights[perm[i]];
}
*weights_out = weights_new;
} else {
*weights_out = nullptr;
}
return nx;
}
/** compute centroids as (weighted) sum of training points
*
* @param x training vectors, size n * code_size (from codec)
* @param codec how to decode the vectors (if NULL then cast to float*)
* @param weights per-training vector weight, size n (or NULL)
* @param assign nearest centroid for each training vector, size n
* @param k_frozen do not update the k_frozen first centroids
* @param centroids centroid vectors (output only), size k * d
* @param hassign histogram of assignments per centroid (size k),
* should be 0 on input
*
*/
void compute_centroids(
size_t d,
size_t k,
size_t n,
size_t k_frozen,
const uint8_t* x,
const Index* codec,
const int64_t* assign,
const float* weights,
float* hassign,
float* centroids) {
k -= k_frozen;
centroids += k_frozen * d;
memset(centroids, 0, sizeof(*centroids) * d * k);
size_t line_size = codec ? codec->sa_code_size() : d * sizeof(float);
#pragma omp parallel
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// this thread is taking care of centroids c0:c1
size_t c0 = (k * rank) / nt;
size_t c1 = (k * (rank + 1)) / nt;
std::vector<float> decode_buffer(d);
for (size_t i = 0; i < n; i++) {
int64_t ci = assign[i];
assert(ci >= 0 && ci < k + k_frozen);
ci -= k_frozen;
if (ci >= c0 && ci < c1) {
float* c = centroids + ci * d;
const float* xi;
if (!codec) {
xi = reinterpret_cast<const float*>(x + i * line_size);
} else {
float* xif = decode_buffer.data();
codec->sa_decode(1, x + i * line_size, xif);
xi = xif;
}
if (weights) {
float w = weights[i];
hassign[ci] += w;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j] * w;
}
} else {
hassign[ci] += 1.0;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j];
}
}
}
}
}
#pragma omp parallel for
for (idx_t ci = 0; ci < k; ci++) {
if (hassign[ci] == 0) {
continue;
}
float norm = 1 / hassign[ci];
float* c = centroids + ci * d;
for (size_t j = 0; j < d; j++) {
c[j] *= norm;
}
}
}
// a bit above machine epsilon for float16
#define EPS (1 / 1024.)
/** Handle empty clusters by splitting larger ones.
*
* It works by slightly changing the centroids to make 2 clusters from
* a single one. Takes the same arguments as compute_centroids.
*
 * @return nb of splitting operations (larger is worse)
*/
int split_clusters(
size_t d,
size_t k,
size_t n,
size_t k_frozen,
float* hassign,
float* centroids) {
k -= k_frozen;
centroids += k_frozen * d;
/* Take care of empty clusters */
size_t nsplit = 0;
RandomGenerator rng(1234);
for (size_t ci = 0; ci < k; ci++) {
if (hassign[ci] == 0) { /* need to redefine a centroid */
size_t cj;
for (cj = 0; 1; cj = (cj + 1) % k) {
/* probability to pick this cluster for split */
float p = (hassign[cj] - 1.0) / (float)(n - k);
float r = rng.rand_float();
if (r < p) {
break; /* found our cluster to be split */
}
}
memcpy(centroids + ci * d,
centroids + cj * d,
sizeof(*centroids) * d);
/* small symmetric perturbation */
for (size_t j = 0; j < d; j++) {
if (j % 2 == 0) {
centroids[ci * d + j] *= 1 + EPS;
centroids[cj * d + j] *= 1 - EPS;
} else {
centroids[ci * d + j] *= 1 - EPS;
centroids[cj * d + j] *= 1 + EPS;
}
}
/* assume even split of the cluster */
hassign[ci] = hassign[cj] / 2;
hassign[cj] -= hassign[ci];
nsplit++;
}
}
return nsplit;
}
} // namespace
void Clustering::train_encoded(
idx_t nx,
const uint8_t* x_in,
const Index* codec,
Index& index,
const float* weights) {
FAISS_THROW_IF_NOT_FMT(
nx >= k,
"Number of training points (%" PRId64
") should be at least "
"as large as number of clusters (%zd)",
nx,
k);
FAISS_THROW_IF_NOT_FMT(
(!codec || codec->d == d),
"Codec dimension %d not the same as data dimension %d",
int(codec->d),
int(d));
FAISS_THROW_IF_NOT_FMT(
index.d == d,
"Index dimension %d not the same as data dimension %d",
int(index.d),
int(d));
double t0 = getmillisecs();
if (!codec) {
// Check for NaNs in input data. Normally it is the user's
// responsibility, but it may spare us some hard-to-debug
// reports.
const float* x = reinterpret_cast<const float*>(x_in);
for (size_t i = 0; i < nx * d; i++) {
FAISS_THROW_IF_NOT_MSG(
std::isfinite(x[i]), "input contains NaN's or Inf's");
}
}
const uint8_t* x = x_in;
std::unique_ptr<uint8_t[]> del1;
std::unique_ptr<float[]> del3;
size_t line_size = codec ? codec->sa_code_size() : sizeof(float) * d;
if (nx > k * max_points_per_centroid) {
uint8_t* x_new;
float* weights_new;
nx = subsample_training_set(
*this, nx, x, line_size, weights, &x_new, &weights_new);
del1.reset(x_new);
x = x_new;
del3.reset(weights_new);
weights = weights_new;
} else if (nx < k * min_points_per_centroid) {
fprintf(stderr,
"WARNING clustering %" PRId64
" points to %zd centroids: "
"please provide at least %" PRId64 " training points\n",
nx,
k,
idx_t(k) * min_points_per_centroid);
}
if (nx == k) {
// this is a corner case, just copy training set to clusters
if (verbose) {
printf("Number of training points (%" PRId64
") same as number of "
"clusters, just copying\n",
nx);
}
centroids.resize(d * k);
if (!codec) {
memcpy(centroids.data(), x_in, sizeof(float) * d * k);
} else {
codec->sa_decode(nx, x_in, centroids.data());
}
// one fake iteration...
ClusteringIterationStats stats = {0.0, 0.0, 0.0, 1.0, 0};
iteration_stats.push_back(stats);
index.reset();
index.add(k, centroids.data());
return;
}
if (verbose) {
printf("Clustering %" PRId64
" points in %zdD to %zd clusters, "
"redo %d times, %d iterations\n",
nx,
d,
k,
nredo,
niter);
if (codec) {
printf("Input data encoded in %zd bytes per vector\n",
codec->sa_code_size());
}
}
std::unique_ptr<idx_t[]> assign(new idx_t[nx]);
std::unique_ptr<float[]> dis(new float[nx]);
// remember best iteration for redo
bool lower_is_better = index.metric_type != METRIC_INNER_PRODUCT;
float best_obj = lower_is_better ? HUGE_VALF : -HUGE_VALF;
std::vector<ClusteringIterationStats> best_iteration_stats;
std::vector<float> best_centroids;
// support input centroids
FAISS_THROW_IF_NOT_MSG(
centroids.size() % d == 0,
"size of provided input centroids not a multiple of dimension");
size_t n_input_centroids = centroids.size() / d;
if (verbose && n_input_centroids > 0) {
printf(" Using %zd centroids provided as input (%sfrozen)\n",
n_input_centroids,
frozen_centroids ? "" : "not ");
}
double t_search_tot = 0;
if (verbose) {
printf(" Preprocessing in %.2f s\n", (getmillisecs() - t0) / 1000.);
}
t0 = getmillisecs();
// temporary buffer to decode vectors during the optimization
std::vector<float> decode_buffer(codec ? d * decode_block_size : 0);
for (int redo = 0; redo < nredo; redo++) {
if (verbose && nredo > 1) {
printf("Outer iteration %d / %d\n", redo, nredo);
}
// initialize (remaining) centroids with random points from the dataset
centroids.resize(d * k);
std::vector<int> perm(nx);
rand_perm(perm.data(), nx, seed + 1 + redo * 15486557L);
if (!codec) {
for (int i = n_input_centroids; i < k; i++) {
memcpy(&centroids[i * d], x + perm[i] * line_size, line_size);
}
} else {
for (int i = n_input_centroids; i < k; i++) {
codec->sa_decode(1, x + perm[i] * line_size, &centroids[i * d]);
}
}
post_process_centroids();
// prepare the index
if (index.ntotal != 0) {
index.reset();
}
if (!index.is_trained) {
index.train(k, centroids.data());
}
index.add(k, centroids.data());
// k-means iterations
float obj = 0;
for (int i = 0; i < niter; i++) {
double t0s = getmillisecs();
if (!codec) {
index.search(
nx,
reinterpret_cast<const float*>(x),
1,
dis.get(),
assign.get());
} else {
// search by blocks of decode_block_size vectors
size_t code_size = codec->sa_code_size();
for (size_t i0 = 0; i0 < nx; i0 += decode_block_size) {
size_t i1 = i0 + decode_block_size;
if (i1 > nx) {
i1 = nx;
}
codec->sa_decode(
i1 - i0, x + code_size * i0, decode_buffer.data());
index.search(
i1 - i0,
decode_buffer.data(),
1,
dis.get() + i0,
assign.get() + i0);
}
}
InterruptCallback::check();
t_search_tot += getmillisecs() - t0s;
// accumulate objective
obj = 0;
for (int j = 0; j < nx; j++) {
obj += dis[j];
}
// update the centroids
std::vector<float> hassign(k);
size_t k_frozen = frozen_centroids ? n_input_centroids : 0;
compute_centroids(
d,
k,
nx,
k_frozen,
x,
codec,
assign.get(),
weights,
hassign.data(),
centroids.data());
int nsplit = split_clusters(
d, k, nx, k_frozen, hassign.data(), centroids.data());
// collect statistics
ClusteringIterationStats stats = {
obj,
(getmillisecs() - t0) / 1000.0,
t_search_tot / 1000,
imbalance_factor(nx, k, assign.get()),
nsplit};
iteration_stats.push_back(stats);
if (verbose) {
printf(" Iteration %d (%.2f s, search %.2f s): "
"objective=%g imbalance=%.3f nsplit=%d \r",
i,
stats.time,
stats.time_search,
stats.obj,
stats.imbalance_factor,
nsplit);
fflush(stdout);
}
post_process_centroids();
// add centroids to index for the next iteration (or for output)
index.reset();
if (update_index) {
index.train(k, centroids.data());
}
index.add(k, centroids.data());
InterruptCallback::check();
}
if (verbose)
printf("\n");
if (nredo > 1) {
if ((lower_is_better && obj < best_obj) ||
(!lower_is_better && obj > best_obj)) {
if (verbose) {
printf("Objective improved: keep new clusters\n");
}
best_centroids = centroids;
best_iteration_stats = iteration_stats;
best_obj = obj;
}
index.reset();
}
}
if (nredo > 1) {
centroids = best_centroids;
iteration_stats = best_iteration_stats;
index.reset();
index.add(k, best_centroids.data());
}
}
Clustering1D::Clustering1D(int k) : Clustering(1, k) {}
Clustering1D::Clustering1D(int k, const ClusteringParameters& cp)
: Clustering(1, k, cp) {}
void Clustering1D::train_exact(idx_t n, const float* x) {
const float* xt = x;
std::unique_ptr<uint8_t[]> del;
if (n > k * max_points_per_centroid) {
uint8_t* x_new;
float* weights_new;
n = subsample_training_set(
*this,
n,
(uint8_t*)x,
sizeof(float) * d,
nullptr,
&x_new,
&weights_new);
del.reset(x_new);
xt = (float*)x_new;
}
centroids.resize(k);
double uf = kmeans1d(xt, n, k, centroids.data());
ClusteringIterationStats stats = {0.0, 0.0, 0.0, uf, 0};
iteration_stats.push_back(stats);
}
float kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids) {
Clustering clus(d, k);
clus.verbose = d * n * k > (1L << 30);
// display logs if > 1Gflop per iteration
IndexFlatL2 index(d);
clus.train(n, x, index);
memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
return clus.iteration_stats.back().obj;
}
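// Example usage (sketch, hypothetical sizes and data):
//
//   size_t d = 16, n = 10000, k = 100;
//   std::vector<float> x(n * d), centroids(k * d);
//   faiss::float_rand(x.data(), x.size(), 42); // random training set
//   float err = faiss::kmeans_clustering(
//           d, n, k, x.data(), centroids.data());
//
// err is the final objective: the sum over training points of the
// (squared L2) distance to their nearest centroid.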
/******************************************************************************
* ProgressiveDimClustering implementation
******************************************************************************/
ProgressiveDimClusteringParameters::ProgressiveDimClusteringParameters() {
progressive_dim_steps = 10;
apply_pca = true; // seems a good idea to do this by default
niter = 10; // reduce nb of iterations per step
}
Index* ProgressiveDimIndexFactory::operator()(int dim) {
return new IndexFlatL2(dim);
}
ProgressiveDimClustering::ProgressiveDimClustering(int d, int k) : d(d), k(k) {}
ProgressiveDimClustering::ProgressiveDimClustering(
int d,
int k,
const ProgressiveDimClusteringParameters& cp)
: ProgressiveDimClusteringParameters(cp), d(d), k(k) {}
namespace {
using idx_t = Index::idx_t;
void copy_columns(idx_t n, idx_t d1, const float* src, idx_t d2, float* dest) {
idx_t d = std::min(d1, d2);
for (idx_t i = 0; i < n; i++) {
memcpy(dest, src, sizeof(float) * d);
src += d1;
dest += d2;
}
}
} // namespace
void ProgressiveDimClustering::train(
idx_t n,
const float* x,
ProgressiveDimIndexFactory& factory) {
int d_prev = 0;
PCAMatrix pca(d, d);
std::vector<float> xbuf;
if (apply_pca) {
if (verbose) {
printf("Training PCA transform\n");
}
pca.train(n, x);
if (verbose) {
printf("Apply PCA\n");
}
xbuf.resize(n * d);
pca.apply_noalloc(n, x, xbuf.data());
x = xbuf.data();
}
for (int iter = 0; iter < progressive_dim_steps; iter++) {
int di = int(pow(d, (1. + iter) / progressive_dim_steps));
if (verbose) {
printf("Progressive dim step %d: cluster in dimension %d\n",
iter,
di);
}
std::unique_ptr<Index> clustering_index(factory(di));
Clustering clus(di, k, *this);
if (d_prev > 0) {
// copy warm-start centroids (padded with 0s)
clus.centroids.resize(k * di);
copy_columns(
k, d_prev, centroids.data(), di, clus.centroids.data());
}
std::vector<float> xsub(n * di);
copy_columns(n, d, x, di, xsub.data());
clus.train(n, xsub.data(), *clustering_index.get());
centroids = clus.centroids;
iteration_stats.insert(
iteration_stats.end(),
clus.iteration_stats.begin(),
clus.iteration_stats.end());
d_prev = di;
}
if (apply_pca) {
if (verbose) {
printf("Revert PCA transform on centroids\n");
}
std::vector<float> cent_transformed(d * k);
pca.reverse_transform(k, centroids.data(), cent_transformed.data());
cent_transformed.swap(centroids);
}
}
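// Worked example of the dimension schedule: di = int(pow(d,
// (1 + iter) / progressive_dim_steps)) grows geometrically from roughly
// d^(1/steps) up to d. E.g. with d = 256 and progressive_dim_steps = 2,
// the two steps cluster in dimension ~16 and then 256, each warm-started
// from the previous step's centroids (padded with zeros).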
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
#include <faiss/Index.h>
#include <vector>
namespace faiss {
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
struct ClusteringParameters {
int niter; ///< clustering iterations
int nredo; ///< redo clustering this many times and keep best
bool verbose;
bool spherical; ///< do we want normalized centroids?
bool int_centroids; ///< round centroids coordinates to integer
bool update_index; ///< re-train index after each iteration?
bool frozen_centroids; ///< use the centroids provided as input and do not
///< change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
int seed; ///< seed for the random number generator
size_t decode_block_size; ///< how many vectors at a time to decode
/// sets reasonable defaults
ClusteringParameters();
};
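/* Example (sketch): overriding a few defaults before clustering
 *
 *   faiss::ClusteringParameters cp;
 *   cp.niter = 50;       // more k-means iterations than the default 25
 *   cp.spherical = true; // L2-normalize centroids after each update
 *   faiss::Clustering clus(d, k, cp); // d, k chosen by the caller
 */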
struct ClusteringIterationStats {
float obj; ///< objective values (sum of distances reported by index)
double time; ///< seconds for iteration
double time_search; ///< seconds for just search
double imbalance_factor; ///< imbalance factor of iteration
int nsplit; ///< number of cluster splits
};
/** K-means clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
 * On output, the centroids table is set to the latest version
 * of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
* initialization.
*
*/
struct Clustering : ClusteringParameters {
typedef Index::idx_t idx_t;
size_t d; ///< dimension of the vectors
size_t k; ///< nb of centroids
/** centroids (k * d)
* if centroids are set on input to train, they will be used as
* initialization
*/
std::vector<float> centroids;
/// stats at every iteration of clustering
std::vector<ClusteringIterationStats> iteration_stats;
Clustering(int d, int k);
Clustering(int d, int k, const ClusteringParameters& cp);
/** run k-means training
*
* @param x training vectors, size n * d
* @param index index used for assignment
* @param x_weights weight associated to each vector: NULL or size n
*/
virtual void train(
idx_t n,
const float* x,
faiss::Index& index,
const float* x_weights = nullptr);
/** run with encoded vectors
*
 * in addition to train()'s parameters, takes a codec as parameter
* to decode the input vectors.
*
* @param codec codec used to decode the vectors (nullptr =
 *              vectors are in fact floats)
*/
void train_encoded(
idx_t nx,
const uint8_t* x_in,
const Index* codec,
Index& index,
const float* weights = nullptr);
/// Post-process the centroids after each centroid update.
/// includes optional L2 normalization and nearest integer rounding
void post_process_centroids();
virtual ~Clustering() {}
};
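/* Example (sketch): warm-starting from user-provided centroids. Filling
 * the centroids table before train() uses those vectors as initialization;
 * with frozen_centroids they are kept fixed across iterations:
 *
 *   faiss::Clustering clus(d, k);
 *   clus.centroids.assign(init.begin(), init.end()); // n_init * d floats
 *   clus.frozen_centroids = true;
 *   faiss::IndexFlatL2 index(d);
 *   clus.train(n, x, index); // x: n * d training vectors
 */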
/** Exact 1D clustering algorithm
*
* Since it does not use an index, it does not overload the train() function
*/
struct Clustering1D : Clustering {
explicit Clustering1D(int k);
Clustering1D(int k, const ClusteringParameters& cp);
void train_exact(idx_t n, const float* x);
virtual ~Clustering1D() {}
};
struct ProgressiveDimClusteringParameters : ClusteringParameters {
int progressive_dim_steps; ///< number of incremental steps
bool apply_pca; ///< apply PCA on input
ProgressiveDimClusteringParameters();
};
/** generates an index suitable for clustering when called */
struct ProgressiveDimIndexFactory {
/// ownership transferred to caller
virtual Index* operator()(int dim);
virtual ~ProgressiveDimIndexFactory() {}
};
/** K-means clustering with progressive dimensions used
*
* The clustering first happens in dim 1, then with exponentially increasing
* dimension until d (I steps). This is typically applied after a PCA
* transformation (optional). Reference:
*
* "Improved Residual Vector Quantization for High-dimensional Approximate
* Nearest Neighbor Search"
*
* Shicong Liu, Hongtao Lu, Junru Shao, AAAI'15
*
* https://arxiv.org/abs/1509.05195
*/
struct ProgressiveDimClustering : ProgressiveDimClusteringParameters {
using idx_t = Index::idx_t;
size_t d; ///< dimension of the vectors
size_t k; ///< nb of centroids
/** centroids (k * d) */
std::vector<float> centroids;
/// stats at every iteration of clustering
std::vector<ClusteringIterationStats> iteration_stats;
ProgressiveDimClustering(int d, int k);
ProgressiveDimClustering(
int d,
int k,
const ProgressiveDimClusteringParameters& cp);
void train(idx_t n, const float* x, ProgressiveDimIndexFactory& factory);
virtual ~ProgressiveDimClustering() {}
};
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @return final quantization error
*/
float kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids);
} // namespace faiss
#endif