Commit 395d2ce6 authored by huchen's avatar huchen
Browse files

init the faiss for rocm

parent 5ded39f5
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import numpy as np
import faiss
import argparse
from multiprocessing.dummy import Pool as ThreadPool
def ivecs_mmap(fname):
    """Memory-map an .ivecs file and expose it as a 2-D int32 array."""
    raw = np.memmap(fname, dtype='int32', mode='r')
    dim = raw[0]
    # each row stores its dimension followed by `dim` values; strip the prefix
    table = raw.reshape(-1, dim + 1)
    return table[:, 1:]
def fvecs_mmap(fname):
    """Memory-map an .fvecs file by reinterpreting the ivecs layout as float32."""
    as_ints = ivecs_mmap(fname)
    return as_ints.view('float32')
def produce_batches(args):
    """Yield (ids, vectors) batches over rows args.i0:args.i1 of args.input."""
    data = fvecs_mmap(args.input)
    if args.i1 == -1:
        # -1 means "up to the end of the file"
        args.i1 = len(data)
    print("Iterating on vectors %d:%d from %s by batches of size %d" % (
        args.i0, args.i1, args.input, args.bs))
    for start in range(args.i0, args.i1, args.bs):
        stop = min(start + args.bs, args.i1)
        yield np.arange(start, stop), data[start:stop]
def rate_limited_iter(l):
    'a thread pre-processes the next element'
    # single worker thread that computes next(l) in the background so the
    # consumer can overlap its processing with production of the next batch
    pool = ThreadPool(1)
    res = None

    def next_or_None():
        # fetch the next element; None signals exhaustion of the iterator
        try:
            return next(l)
        except StopIteration:
            return None

    while True:
        # schedule the fetch of the *next* element before yielding the
        # current one: one element of prefetch
        res_next = pool.apply_async(next_or_None)
        if res is not None:
            res = res.get()
            if res is None:
                return
            yield res
        res = res_next
# location of the Deep1B dataset files (fvecs format)
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
# working directory where trained / filled indexes are stored
workdir = "/checkpoint/matthijs/ondisk_distributed/"
def main():
    """Fill a pre-trained (empty) index with a vertical slice of the database.

    Reads vectors args.i0:args.i1 from the input fvecs file by batches,
    adds them with their ids to the index loaded from args.inputindex,
    and writes the filled index to args.o.
    """
    parser = argparse.ArgumentParser(
        description='make index for a subset of the data')

    def aa(*args, **kwargs):
        # shorthand: add an argument to the current group
        group.add_argument(*args, **kwargs)

    group = parser.add_argument_group('index type')
    aa('--inputindex',
       default=workdir + 'trained.faissindex',
       help='empty input index to fill in')
    aa('--nt', default=-1, type=int, help='nb of openmp threads to use')

    group = parser.add_argument_group('db options')
    aa('--input', default=deep1bdir + "base.fvecs")
    aa('--bs', default=2**18, type=int,
       help='batch size for db access')
    aa('--i0', default=0, type=int, help='lower bound to index')
    aa('--i1', default=-1, type=int, help='upper bound of vectors to index')

    group = parser.add_argument_group('output')
    aa('-o', default='/tmp/x', help='output index')
    aa('--keepquantizer', default=False, action='store_true',
       help='by default we remove the data from the quantizer to save space')

    args = parser.parse_args()
    print('args=', args)

    print('start accessing data')
    src = produce_batches(args)

    print('loading index', args.inputindex)
    index = faiss.read_index(args.inputindex)

    if args.nt != -1:
        faiss.omp_set_num_threads(args.nt)

    t0 = time.time()
    ntot = 0
    # one batch of prefetch: disk reads overlap with the add() calls
    for ids, x in rate_limited_iter(src):
        print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0))
        index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids)
        ntot += ids.size

    index_ivf = faiss.extract_index_ivf(index)
    print('invlists stats: imbalance %.3f' % index_ivf.invlists.imbalance_factor())
    index_ivf.invlists.print_stats()

    if not args.keepquantizer:
        # drop the vectors stored in the coarse quantizer: once the
        # invlists are filled only the assignment structure is needed
        print('resetting quantizer content')
        index_ivf = faiss.extract_index_ivf(index)
        index_ivf.quantizer.reset()

    print('store output', args.o)
    faiss.write_index(index, args.o)


if __name__ == '__main__':
    main()
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import faiss
# dataset and working-directory locations
deep1bdir = "/datasets01_101/simsearch/041218/deep1b/"
workdir = "/checkpoint/matthijs/ondisk_distributed/"

print('Load centroids')
centroids = np.load(workdir + '1M_centroids.npy')
ncent, d = centroids.shape

print('apply random rotation')
# fixed-seed random rotation so the transform is reproducible; balances
# the variance across dimensions before scalar quantization
rrot = faiss.RandomRotationMatrix(d, d)
rrot.init(1234)
centroids = rrot.apply_py(centroids)

print('make HNSW index as quantizer')
# the coarse quantizer is an HNSW graph over the (rotated) centroids
quantizer = faiss.IndexHNSWFlat(d, 32)
quantizer.hnsw.efSearch = 1024
quantizer.hnsw.efConstruction = 200
quantizer.add(centroids)

print('build index')
index = faiss.IndexPreTransform(
    rrot,
    faiss.IndexIVFScalarQuantizer(
        quantizer, d, ncent, faiss.ScalarQuantizer.QT_6bit
    )
)
def ivecs_mmap(fname):
    """Map an .ivecs file into memory as a (n, d) int32 array."""
    flat = np.memmap(fname, dtype='int32', mode='r')
    d = flat[0]
    # rows are [d, v0 .. v(d-1)]; drop the leading dimension column
    return flat.reshape(-1, d + 1)[:, 1:]
def fvecs_mmap(fname):
    """Map an .fvecs file: same layout as ivecs, payload viewed as float32."""
    return ivecs_mmap(fname).view('float32')
print('finish training index')
# train rotation + scalar quantizer on a 256k-vector sample of the learn set
xt = fvecs_mmap(deep1bdir + 'learn.fvecs')
xt = np.ascontiguousarray(xt[:256 * 1000], dtype='float32')
index.train(xt)

print('write output')
faiss.write_index(index, workdir + 'trained.faissindex')
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import faiss
import argparse
from multiprocessing.dummy import Pool as ThreadPool
if __name__ == '__main__':
    # Merge the inverted lists of several IVF indexes into a single
    # on-disk inverted-list file, optionally restricted to a list range.
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputs', nargs='*', required=True,
                        help='input indexes to merge')
    parser.add_argument('--l0', type=int, default=0)
    parser.add_argument('--l1', type=int, default=-1)
    # BUGFIX: was missing type=int, so a user-supplied value arrived as a
    # string; it was also only printed, never applied
    parser.add_argument('--nt', type=int, default=-1,
                        help='nb threads')
    parser.add_argument('--output', required=True,
                        help='output index filename')
    parser.add_argument('--outputIL',
                        help='output invfile filename')
    args = parser.parse_args()

    if args.nt != -1:
        print('set nb of threads to', args.nt)
        faiss.omp_set_num_threads(args.nt)

    ils = faiss.InvertedListsPtrVector()
    # keep Python references alive so SWIG does not deallocate the lists
    ils_dont_dealloc = []

    pool = ThreadPool(20)

    def load_index(fname):
        # load one input index mmapped / read-only; (fname, None) on failure
        print("loading", fname)
        try:
            index = faiss.read_index(
                fname, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
        except RuntimeError as e:
            print('could not load %s: %s' % (fname, e))
            return fname, None
        print(" %d entries" % index.ntotal)
        return fname, index

    index0 = None
    for _, index in pool.imap(load_index, args.inputs):
        if index is None:
            continue
        index_ivf = faiss.extract_index_ivf(index)
        # detach the invlists from the index and take ownership
        il = faiss.downcast_InvertedLists(index_ivf.invlists)
        index_ivf.invlists = None
        il.this.own()
        ils_dont_dealloc.append(il)
        if (args.l0, args.l1) != (0, -1):
            print('restricting to lists %d:%d' % (args.l0, args.l1))
            il.crop_invlists(args.l0, args.l1)
        ils.push_back(il)
        # keep the first successfully loaded index as the output skeleton
        if index0 is None:
            index0 = index

    print("loaded %d invlists" % ils.size())

    if not args.outputIL:
        args.outputIL = args.output + '_invlists'

    il0 = ils.at(0)

    il = faiss.OnDiskInvertedLists(
        il0.nlist, il0.code_size,
        args.outputIL)

    print("perform merge")
    ntotal = il.merge_from(ils.data(), ils.size(), True)

    print("swap into index0")
    index0_ivf = faiss.extract_index_ivf(index0)
    index0_ivf.nlist = il0.nlist
    index0_ivf.ntotal = index0.ntotal = ntotal
    index0_ivf.invlists = il
    # the on-disk lists outlive the index object; don't let it free them
    index0_ivf.own_invlists = False

    print("write", args.output)
    faiss.write_index(index0, args.output)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Simplistic RPC implementation.
Exposes all functions of a Server object.
Uses pickle for serialization and the socket interface.
"""
import os,pdb,pickle,time,errno,sys,_thread,traceback,socket,threading,gc
# default TCP port the RPC server listens on
PORT = 12032
#########################################################################
# simple I/O functions
def inline_send_handle(f, conn):
    """Send the contents of open file `f` over `conn`: pickled size, then raw bytes."""
    nbytes = os.fstat(f.fileno()).st_size
    pickle.dump(nbytes, conn)
    conn.write(f.read(nbytes))
def inline_send_string(s, conn):
    """Send buffer `s` over `conn`: pickled length followed by the payload."""
    pickle.dump(len(s), conn)
    conn.write(s)
class FileSock:
    " wraps a socket so that it is usable by pickle/cPickle "

    def __init__(self, sock):
        self.sock = sock
        self.nr = 0   # number of read() calls (debugging aid)

    def write(self, buf):
        """Send the whole buffer, in chunks of at most 512 KiB."""
        bs = 512 * 1024
        ns = 0
        while ns < len(buf):
            sent = self.sock.send(buf[ns:ns + bs])
            ns += sent

    def read(self, bs=512 * 1024):
        """Read up to bs bytes (possibly fewer if the peer closes)."""
        self.nr += 1
        chunks = []
        nb = 0
        # BUGFIX: the loop condition used to be len(chunks) < bs, which
        # compared the chunk *count* to the byte budget; it only terminated
        # through an extra recv(0) call returning b'' once nb reached bs.
        while nb < bs:
            rb = self.sock.recv(bs - nb)
            if not rb:
                break
            chunks.append(rb)
            nb += len(rb)
        return b''.join(chunks)

    def readline(self):
        """Read up to and including the next newline. May be optimized..."""
        s = bytes()
        while True:
            c = self.read(1)
            s += c
            if len(c) == 0 or chr(c[0]) == '\n':
                return s
class ClientExit(Exception):
    """Raised server-side when the client closes the connection."""
    pass


class ServerException(Exception):
    """Raised client-side to relay an error that occurred on the server."""
    pass
class Server:
    """
    server protocol. Methods from classes that subclass Server can be called
    transparently from a client
    """

    def __init__(self, s, logf=sys.stderr, log_prefix=''):
        # s: an already-accepted, connected socket
        self.logf = logf
        self.log_prefix = log_prefix
        # connection
        self.conn = s
        self.fs = FileSock(s)

    def log(self, s):
        # write one prefixed line to the log stream
        self.logf.write("Sever log %s: %s\n" % (self.log_prefix, s))

    def one_function(self):
        """
        Executes a single function with associated I/O.
        Protocol:
        - the arguments and results are serialized with the pickle protocol
        - client sends : (fname,args)
        fname = method name to call
        args = tuple of arguments
        - server sends result: (rid,st,ret)
        rid = request id
        st = None, or exception if there was during execution
        ret = return value or None if st!=None
        """
        try:
            (fname, args) = pickle.load(self.fs)
        except EOFError:
            # client hung up before sending a request
            raise ClientExit("read args")
        self.log("executing method %s" % (fname))
        st = None
        ret = None
        try:
            f = getattr(self, fname)
        except AttributeError:
            st = AttributeError("unknown method " + fname)
            self.log("unknown method ")
        # NOTE(review): if the method name was unknown, `f` is unbound here,
        # so the call below raises NameError which overwrites `st`; the
        # client still receives an error, just not the AttributeError above
        try:
            ret = f(*args)
        except Exception as e:
            # due to a bug (in mod_python?), ServerException cannot be
            # unpickled, so send the string and make the exception on the client side
            # st=ServerException(
            #   "".join(traceback.format_tb(sys.exc_info()[2]))+
            #   str(e))
            st = "".join(traceback.format_tb(sys.exc_info()[2])) + str(e)
            self.log("exception in method")
            traceback.print_exc(50, self.logf)
            self.logf.flush()
        print("return")
        try:
            pickle.dump((st, ret), self.fs, protocol=4)
        except EOFError:
            raise ClientExit("function return")

    def exec_loop(self):
        """ main execution loop. Loops and handles exit states"""
        self.log("in exec_loop")
        try:
            while True:
                self.one_function()
        except ClientExit as e:
            # normal termination: client closed the connection
            self.log("ClientExit %s" % e)
        except socket.error as e:
            self.log("socket error %s" % e)
            traceback.print_exc(50, self.logf)
        except EOFError:
            self.log("EOF during communication")
            traceback.print_exc(50, self.logf)
        except BaseException:
            # unexpected
            traceback.print_exc(50, sys.stderr)
            sys.exit(1)
        print("exit sever")

    def exec_loop_cleanup(self):
        # hook for subclasses; not invoked anywhere in this file
        pass

    ###################################################################
    # spying stuff

    def get_ps_stats(self):
        """Return uptime / process / run-queue stats of the server host."""
        ret = ''
        f = os.popen("echo ============ `hostname` uptime:; uptime;" +
                     "echo ============ self:; " +
                     "ps -p %d -o pid,vsize,rss,%%cpu,nlwp,psr; " % os.getpid() +
                     "echo ============ run queue:;" +
                     "ps ar -o user,pid,%cpu,%mem,ni,nlwp,psr,vsz,rss,cputime,command")
        for l in f:
            ret += l
        return ret
class Client:
    """
    Methods of the server object can be called transparently. Exceptions are
    re-raised.
    """

    def __init__(self, HOST, port=PORT, v6=False):
        socktype = socket.AF_INET6 if v6 else socket.AF_INET
        sock = socket.socket(socktype, socket.SOCK_STREAM)
        print("connecting", HOST, port, socktype)
        sock.connect((HOST, port))
        self.sock = sock
        self.fs = FileSock(sock)

    def generic_fun(self, fname, args):
        # send the (method name, argument tuple) request and wait for the reply
        pickle.dump((fname, args), self.fs, protocol=4)
        return self.get_result()

    def get_result(self):
        # replies are (error-or-None, return value); re-raise server errors
        (st, ret) = pickle.load(self.fs)
        if st != None:
            raise ServerException(st)
        else:
            return ret

    def __getattr__(self, name):
        # any attribute access becomes a remote method call
        return lambda *x: self.generic_fun(name, x)
def run_server(new_handler, port=PORT, report_to_file=None, v6=False):
    """Accept connections forever, one handler thread per connection.

    new_handler: callable mapping an accepted socket to an object with an
    exec_loop() method (typically a Server subclass).
    report_to_file: optional path where "host:port " is written once the
    server is listening, so clients can discover it.
    """
    HOST = ''  # Symbolic name meaning the local host
    socktype = socket.AF_INET6 if v6 else socket.AF_INET
    s = socket.socket(socktype, socket.SOCK_STREAM)
    # allow immediate rebinding of the port after a restart
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    print("bind %s:%d" % (HOST, port))
    s.bind((HOST, port))
    s.listen(5)
    print("accepting connections")

    if report_to_file is not None:
        print('storing host+port in', report_to_file)
        # BUGFIX: close the file instead of leaking the handle
        with open(report_to_file, 'w') as f:
            f.write('%s:%d ' % (socket.gethostname(), port))

    while True:
        try:
            conn, addr = s.accept()
        except socket.error as e:
            # BUGFIX: on Python 3, socket.error is OSError and is not
            # subscriptable -- the old `e[1] == 'Interrupted system call'`
            # raised TypeError. Test errno instead (PEP 475 makes this
            # mostly redundant on 3.5+, but it is harmless).
            if e.errno == errno.EINTR:
                continue
            raise
        print('Connected by', addr, end=' ')
        ibs = new_handler(conn)
        tid = _thread.start_new_thread(ibs.exec_loop, ())
        print("tid", tid)
#! /bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
set -e

# which stage of the pipeline to run (first positional argument)
todo=$1

# other options can be transmitted
shift

# the training data of the Deep1B dataset
deep1bdir=/datasets01_101/simsearch/041218/deep1b
traindata=$deep1bdir/learn.fvecs

# this is for small tests
nvec=1000000
k=4000

# for the real run
# nvec=50000000
# k=1000000

# working directory for the real run
workdir=/checkpoint/matthijs/ondisk_distributed

mkdir -p $workdir/{vslices,hslices}

if [ -z "$todo" ]; then
    echo "nothing to do"
    exit 1
elif [ $todo == test_kmeans_0 ]; then
    # non distributed baseline
    python distributed_kmeans.py \
         --indata $traindata --i1 $nvec \
         --k $k

elif [ $todo == test_kmeans_1 ]; then
    # using all the machine's GPUs
    python distributed_kmeans.py \
         --indata $traindata --i1 $nvec \
         --k $k --gpu -1

elif [ $todo == test_kmeans_2 ]; then
    # distributed run, with one local server per GPU
    ngpu=$( echo /dev/nvidia? | wc -w )
    baseport=12012

    # kill background processes on exit of this script
    trap 'kill -HUP 0' 0

    hostports=''

    for((gpu=0;gpu<ngpu;gpu++)); do
        # range of vectors to assign to each server
        i0=$((nvec * gpu / ngpu))
        i1=$((nvec * (gpu + 1) / ngpu))
        port=$(( baseport + gpu ))

        echo "start server $gpu for range $i0:$i1"

        python distributed_kmeans.py \
             --indata $traindata \
             --i0 $i0 --i1 $i1 \
             --server --gpu $gpu \
             --port $port --ipv4 &

        hostports="$hostports localhost:$port"
    done

    # lame way of making sure all servers are running
    sleep 5s

    python distributed_kmeans.py \
         --client --servers "$hostports" \
         --k $k --ipv4
elif [ $todo == slurm_distributed_kmeans ]; then
    # launch one distributed_kmeans server per slurm task
    nserv=5

    srun -n$nserv \
         --time=48:00:00 \
         --cpus-per-task=40 --gres=gpu:4 --mem=100G \
         --partition=priority --comment='priority is the only one that works' \
         -l bash $( realpath $0 ) slurm_within_kmeans_server

elif [ $todo == slurm_within_kmeans_server ]; then
    # runs inside one slurm task; rank 0 additionally acts as the client
    nserv=$SLURM_NPROCS
    [ ! -z "$nserv" ] || (echo "should be run by slurm"; exit 1)
    rank=$SLURM_PROCID

    baseport=12012

    # range of vectors served by this rank
    i0=$((nvec * rank / nserv))
    i1=$((nvec * (rank + 1) / nserv))
    port=$(( baseport + rank ))

    echo "host $(hostname) start server $rank for range $i0:$i1 port $port"

    if [ $rank != 0 ]; then
        python -u distributed_kmeans.py \
             --indata $traindata \
             --i0 $i0 --i1 $i1 \
             --server --gpu -1 \
             --port $port --ipv4
    else
        # master process
        # kill background processes on exit of this script
        trap 'kill -HUP 0' 0

        python -u distributed_kmeans.py \
             --indata $traindata \
             --i0 $i0 --i1 $i1 \
             --server --gpu -1 \
             --port $port --ipv4 &

        # Slurm has a somewhat convoluted way of specifying the nodes
        # assigned to each task. This is to parse the SLURM_TASKS_PER_NODE variable
        function parse_tasks_per_node () {
            local blocks=$1
            for block in ${blocks//,/ }; do
                # blocks look like either "3" or "2(x4)" (2 tasks on 4 nodes)
                if [ ${block/x/} != $block ]; then
                    tpn="${block%(*}"
                    repeat=${block#*x}
                    repeat=${repeat%?}
                    for((i=0;i<repeat;i++)); do
                        echo $tpn
                    done
                else
                    echo $block
                fi
            done
        }

        hostports=""
        port=$baseport
        echo VARS $SLURM_TASKS_PER_NODE $SLURM_JOB_NODELIST
        tasks_per_node=( $( parse_tasks_per_node $SLURM_TASKS_PER_NODE ) )
        nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
        n=${#nodes[*]}

        # build the host:port list of all servers
        for((i=0;i<n;i++)); do
            hostname=${nodes[i]}
            for((j=0;j<tasks_per_node[i];j++)); do
                hostports="$hostports $hostname:$port"
                ((port++))
            done
        done

        echo HOSTPORTS $hostports

        # lame way of making sure all servers are running
        sleep 20s

        # run client
        python distributed_kmeans.py \
             --client --servers "$hostports" \
             --k $k --ipv4 "$@"

        echo "Done, kill the job"
        scancel $SLURM_JOBID

    fi
elif [ $todo == deep1b_clustering ]; then
    # also set nvec=500M and k=10M in the top of the file
    nserv=20

    srun -n$nserv \
         --time=48:00:00 \
         --cpus-per-task=40 --gres=gpu:4 --mem=100G \
         --partition=priority --comment='priority is the only one that works' \
         -l bash $( realpath $0 ) slurm_within_kmeans_server \
         --out $workdir/1M_centroids.npy

elif [ $todo == make_index_vslices ]; then
    # vslice: slice per database shards
    nvec=1000000000
    nslice=200

    for((i=0;i<nslice;i++)); do
        i0=$((nvec * i / nslice))
        i1=$((nvec * (i + 1) / nslice))

        # make the script to be run by sbatch
        cat > $workdir/vslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u make_index_vslice.py \
     --inputindex $workdir/trained.faissindex \
     --input $deep1bdir/base.fvecs \
     --nt 40 \
     --i0 $i0 --i1 $i1 \
     -o $workdir/vslices/slice$i.faissindex
EOF
        # specify resources for script and run it
        sbatch -n1 \
               --time=48:00:00 \
               --cpus-per-task=40 --gres=gpu:0 --mem=200G \
               --output=$workdir/vslices/slice$i.log \
               --job-name=vslice$i.c \
               $workdir/vslices/slice$i.bash

        echo "logs in $workdir/vslices/slice$i.log"
    done

elif [ $todo == make_index_hslices ]; then
    # hslice: slice per inverted lists
    nlist=1000000
    nslice=50

    for((i=0;i<nslice;i++)); do
        i0=$((nlist * i / nslice))
        i1=$((nlist * (i + 1) / nslice))

        # make the script to be run by sbatch
        cat > $workdir/hslices/slice$i.bash <<EOF
#!/bin/bash
srun python -u merge_to_ondisk.py \
     --input $workdir/vslices/slice{0..199}.faissindex \
     --nt 20 \
     --l0 $i0 --l1 $i1 \
     --output $workdir/hslices/slice$i.faissindex \
     --outputIL $workdir/hslices/slice$i.invlists
EOF
        # specify resources for script and run it
        sbatch -n1 \
               --time=48:00:00 \
               --cpus-per-task=20 --gres=gpu:0 --mem=200G \
               --output=$workdir/hslices/slice$i.log \
               --job-name=hslice$i.a \
               --constraint=pascal \
               $workdir/hslices/slice$i.bash

        echo "logs in $workdir/hslices/slice$i.log"
    done

elif [ $todo == run_search_servers ]; then
    # start the search servers (one per slurm task)
    nserv=3
    srun -n$nserv \
         --time=48:00:00 \
         --cpus-per-task=64 --gres=gpu:0 --mem=100G \
         --constraint=pascal \
         --partition=priority --comment='priority is the only one that works' \
         -l python -u search_server.py --port 12012

else
    echo "unknown todo $todo"
    exit 1
fi
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import time
import rpc
import combined_index
import argparse
############################################################
# Server implementation
############################################################
class MyServer(rpc.Server):
    """ Assign version that can be exposed via RPC """

    def __init__(self, s, index):
        # s: connected socket, handled by the rpc.Server machinery
        rpc.Server.__init__(self, s)
        self.index = index

    def __getattr__(self, f):
        # delegate unknown attributes to the wrapped index, so all index
        # methods become remotely callable through the RPC protocol
        return getattr(self.index, f)
def main():
    """Start an RPC server exposing a CombinedIndexDeep1B instance."""
    parser = argparse.ArgumentParser()

    def aa(*args, **kwargs):
        # shorthand: add an argument to the current group
        group.add_argument(*args, **kwargs)

    group = parser.add_argument_group('server options')
    aa('--port', default=12012, type=int, help='server port')
    aa('--when_ready_dir', default=None,
       help='store host:port to this file when ready')
    aa('--ipv4', default=False, action='store_true', help='force ipv4')
    aa('--rank', default=0, type=int,
       help='rank used as index in the client table')

    args = parser.parse_args()

    when_ready = None
    if args.when_ready_dir:
        # one readiness file per rank, polled by whoever launched us
        when_ready = '%s/%d' % (args.when_ready_dir, args.rank)

    print('loading index')
    index = combined_index.CombinedIndexDeep1B()

    print('starting server')
    rpc.run_server(
        lambda s: MyServer(s, index),
        args.port, report_to_file=when_ready,
        v6=not args.ipv4)


if __name__ == '__main__':
    main()
############################################################
# Client implementation
############################################################
from multiprocessing.dummy import Pool as ThreadPool
import faiss
import numpy as np
class ResultHeap:
    """ Combine query results from a sliced dataset (for k-nn search) """

    def __init__(self, nq, k):
        " nq: number of query vectors, k: number of results per query "
        self.I = np.zeros((nq, k), dtype='int64')
        self.D = np.zeros((nq, k), dtype='float32')
        self.nq, self.k = nq, k
        # faiss max-heap structure operating in place on the D / I buffers
        heaps = faiss.float_maxheap_array_t()
        heaps.k = k
        heaps.nh = nq
        heaps.val = faiss.swig_ptr(self.D)
        heaps.ids = faiss.swig_ptr(self.I)
        heaps.heapify()
        self.heaps = heaps

    def add_batch_result(self, D, I, i0):
        # D, I: (nq, k) results from one slice; i0 is the slice's id
        # offset, added to I in place before pushing into the heaps
        assert D.shape == (self.nq, self.k)
        assert I.shape == (self.nq, self.k)
        I += i0
        self.heaps.addn_with_ids(
            self.k, faiss.swig_ptr(D),
            faiss.swig_ptr(I), self.k)

    def finalize(self):
        # sort each heap so results come out in increasing distance order
        self.heaps.reorder()
def distribute_weights(weights, nbin):
    """ assign a set of weights to a smaller set of bins to balance them """
    heaviest_first = weights.argsort()[::-1]
    bins = np.zeros(nbin)
    assign = np.ones(weights.size, dtype=int)
    # greedy balancing: drop the next-heaviest weight into the currently
    # lightest bin
    for idx in heaviest_first:
        target = bins.argmin()
        assign[idx] = target
        bins[target] += weights[idx]
    return bins, assign
class SplitPerListIndex:
    """manages a local index, that does the coarse quantization and a set
    of sub_indexes. The sub_indexes search a subset of the inverted
    lists. The SplitPerListIndex merges results from the sub-indexes"""

    def __init__(self, index, sub_indexes):
        self.index = index
        self.code_size = faiss.extract_index_ivf(index.index).code_size
        self.sub_indexes = sub_indexes
        self.ni = len(self.sub_indexes)
        # pool of threads. Each thread manages one sub-index.
        self.pool = ThreadPool(self.ni)
        self.verbose = False

    def set_nprobe(self, nprobe):
        # propagate the setting to the local index and all sub-indexes
        self.index.set_nprobe(nprobe)
        self.pool.map(
            lambda i: self.sub_indexes[i].set_nprobe(nprobe),
            range(self.ni)
        )

    def set_omp_num_threads(self, nt):
        faiss.omp_set_num_threads(nt)
        self.pool.map(
            lambda idx: idx.set_omp_num_threads(nt),
            self.sub_indexes
        )

    def set_parallel_mode(self, pm):
        self.index.set_parallel_mode(pm)
        self.pool.map(
            lambda idx: idx.set_parallel_mode(pm),
            self.sub_indexes
        )

    def set_prefetch_nthread(self, nt):
        self.index.set_prefetch_nthread(nt)
        self.pool.map(
            lambda idx: idx.set_prefetch_nthread(nt),
            self.sub_indexes
        )

    def balance_lists(self, list_nos):
        """Assign each queried inverted list to a sub-index so that the
        total amount of data scanned per sub-index is balanced."""
        big_il = self.index.big_il
        weights = np.array([big_il.list_size(int(i))
                            for i in list_nos.ravel()])
        bins, assign = distribute_weights(weights, self.ni)
        if self.verbose:
            print('bins weight range %d:%d total %d (%.2f MiB)' % (
                bins.min(), bins.max(), bins.sum(),
                bins.sum() * (self.code_size + 8) / 2 ** 20))
        self.nscan = bins.sum()
        return assign.reshape(list_nos.shape)

    def search(self, x, k):
        # coarse-quantize once locally; each sub-index then scans only the
        # lists assigned to it (the other entries are masked with -1)
        xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
        assign = self.balance_lists(list_nos)

        def do_query(i):
            sub_index = self.sub_indexes[i]
            list_nos_i = list_nos.copy()
            list_nos_i[assign != i] = -1
            t0 = time.time()
            Di, Ii = sub_index.ivf_search_preassigned(
                xqo, list_nos_i, coarse_dis, k)
            if self.verbose:
                print('client %d: %.3f s' % (i, time.time() - t0))
            return Di, Ii

        # merge per-sub-index top-k lists through a result heap
        rh = ResultHeap(x.shape[0], k)
        for Di, Ii in self.pool.imap(do_query, range(self.ni)):
            rh.add_batch_result(Di, Ii, 0)
        rh.finalize()
        return rh.D, rh.I

    def range_search(self, x, radius):
        xqo, list_nos, coarse_dis = self.index.transform_and_assign(x)
        assign = self.balance_lists(list_nos)
        nq = len(x)

        def do_query(i):
            sub_index = self.sub_indexes[i]
            list_nos_i = list_nos.copy()
            list_nos_i[assign != i] = -1
            t0 = time.time()
            limi, Di, Ii = sub_index.ivf_range_search_preassigned(
                xqo, list_nos_i, coarse_dis, radius)
            if self.verbose:
                print('slice %d: %.3f s' % (i, time.time() - t0))
            return limi, Di, Ii

        # per-query result fragments, concatenated over sub-indexes
        D = [[] for i in range(nq)]
        I = [[] for i in range(nq)]
        sizes = np.zeros(nq, dtype=int)
        for lims, Di, Ii in self.pool.imap(do_query, range(self.ni)):
            for i in range(nq):
                l0, l1 = lims[i:i + 2]
                D[i].append(Di[l0:l1])
                I[i].append(Ii[l0:l1])
                sizes[i] += l1 - l0
        # repackage into the standard faiss range-search (lims, D, I) format
        lims = np.zeros(nq + 1, dtype=int)
        lims[1:] = np.cumsum(sizes)
        D = np.hstack([j for i in D for j in i])
        I = np.hstack([j for i in I for j in i])
        return lims, D, I
#! /usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import numpy as np
import time
import faiss
import sys
# Get command-line arguments
k = int(sys.argv[1])
ngpu = int(sys.argv[2])
# Load Leon's file format
def load_mnist(fname):
    """Load an MNIST idx3-ubyte image file into a (nim, xd, yd) uint8 array.

    The 16-byte header contains 4 big-endian int32s: magic, nim, xd, yd.
    """
    print("load", fname)
    # BUGFIX: open in binary mode (np.fromfile on a text-mode handle is
    # unreliable on Python 3) and close the handle via the context manager
    with open(fname, 'rb') as f:
        header = np.fromfile(f, dtype='int8', count=4 * 4)
        # byte-reverse each 4-byte row to decode the big-endian int32s
        # NOTE(review): this trick assumes a little-endian host
        header = header.reshape(4, 4)[:, ::-1].copy().view('int32')
        print(header)
        # skip header[0] (magic number); avoid int() on shape-(1,) rows,
        # which is deprecated in recent numpy
        nim, xd, yd = (int(v) for v in header.ravel()[1:])
        data = np.fromfile(f, count=nim * xd * yd,
                           dtype='uint8')
    print(data.shape, nim, xd, yd)
    data = data.reshape(nim, xd, yd)
    return data
# root directory of the MNIST8m data
# BUGFIX: the trailing '/' was missing, so the concatenation below produced
# the bogus path '/path/to/mnist/datamnist8m/...'
basedir = "/path/to/mnist/data/"

x = load_mnist(basedir + 'mnist8m/mnist8m-patterns-idx3-ubyte')

print("reshape")
# flatten each image to a vector and convert to float32 for faiss
x = x.reshape(x.shape[0], -1).astype('float32')
def train_kmeans(x, k, ngpu):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000

    # one GPU resource object per device
    res = [faiss.StandardGpuResources() for i in range(ngpu)]

    flat_config = []
    for i in range(ngpu):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    if ngpu == 1:
        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
        # one flat index per GPU, queried through a replica set
        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(ngpu)]
        index = faiss.IndexReplicas()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    # objective value (total quantization error) per iteration
    obj = faiss.vector_float_to_array(clus.obj)
    print("final objective: %.4g" % obj[-1])

    return centroids.reshape(k, d)
print("run")
t0 = time.time()
train_kmeans(x, k, ngpu)
t1 = time.time()
print("total runtime: %.3f s" % (t1 - t0))
README for the link & code implementation
=========================================
What is this?
-------------
Link & code is an indexing method that combines HNSW indexing with
compression and exploits the neighborhood structure of the similarity
graph to improve the reconstruction. It is described in
```
@inproceedings{link_and_code,
author = {Matthijs Douze and Alexandre Sablayrolles and Herv\'e J\'egou},
title = {Link and code: Fast indexing with graphs and compact regression codes},
booktitle = {CVPR},
year = {2018}
}
```
ArXiV [here](https://arxiv.org/abs/1804.09996)
Code structure
--------------
The test runs with 3 files:
- `bench_link_and_code.py`: driver script
- `datasets.py`: code to load the datasets. The example code runs on the
deep1b and bigann datasets. See the [toplevel README](../README.md)
on how to download them. They should be put in a directory, edit
datasets.py to set the path.
- `neighbor_codec.py`: this is where the representation is trained.
The code runs on top of Faiss. The HNSW index can be extended with a
`ReconstructFromNeighbors` C++ object that refines the distances. The
training is implemented in Python.
Reproducing Table 2 in the paper
--------------------------------
The results of table 2 (accuracy on deep100M) in the paper can be
obtained with:
```
python bench_link_and_code.py \
--db deep100M \
--M0 6 \
--indexkey OPQ36_144,HNSW32_PQ36 \
--indexfile $bdir/deep100M_PQ36_L6.index \
--beta_nsq 4 \
--beta_centroids $bdir/deep100M_PQ36_L6_nsq4.npy \
--neigh_recons_codes $bdir/deep100M_PQ36_L6_nsq4_codes.npy \
--k_reorder 0,5 --efSearch 1,1024
```
Set `bdir` to a scratch directory.
Explanation of the flags:
- `--db deep1M`: dataset to process
- `--M0 6`: number of links on the base level (L6)
- `--indexkey OPQ36_144,HNSW32_PQ36`: Faiss index key to construct the
HNSW structure. It means that vectors are transformed by OPQ and
encoded with PQ 36x8 (with an intermediate size of 144D). The HNSW
level>0 nodes have 32 links (these are "cheap" to store
because there are fewer nodes in the upper levels).
- `--indexfile $bdir/deep1M_PQ36_M6.index`: name of the index file
(without information for the L&C extension)
- `--beta_nsq 4`: number of bytes to allocate for the codes (M in the
paper)
- `--beta_centroids $bdir/deep1M_PQ36_M6_nsq4.npy`: filename to store
the trained beta centroids
- `--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq4_codes.npy`: filename
for the encoded weights (beta) of the combination
- `--k_reorder 0,5`: number of results to reorder. 0 = baseline
without reordering, 5 = value used throughout the paper
- `--efSearch 1,1024`: number of nodes to visit (T in the paper)
The script will proceed with the following steps:
0. load dataset (and possibly compute the ground-truth if the
ground-truth file is not provided)
1. train the OPQ encoder
2. build the index and store it
3. compute the residuals and train the beta vocabulary to do the reconstruction
4. encode the vertices
5. search and evaluate the search results.
With option `--exhaustive` the results of the exhaustive column can be
obtained.
The run above should output:
```
...
setting k_reorder=5
...
efSearch=1024 0.3132 ms per query, R@1: 0.4283 R@10: 0.6337 R@100: 0.6520 ndis 40941919 nreorder 50000
```
which matches the paper's table 2.
Note that in multi-threaded mode, the building of the HNSW structure
is not deterministic. Therefore, the results across runs may not be exactly the same.
Reproducing Figure 5 in the paper
---------------------------------
Figure 5 just evaluates the combination of HNSW and PQ. For example,
the operating point L6&OPQ40 can be obtained with
```
python bench_link_and_code.py \
--db deep1M \
--M0 6 \
--indexkey OPQ40_160,HNSW32_PQ40 \
--indexfile $bdir/deep1M_PQ40_M6.index \
--beta_nsq 1 --beta_k 1 \
--beta_centroids $bdir/deep1M_PQ40_M6_nsq0.npy \
--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq0_codes.npy \
--k_reorder 0 --efSearch 16,64,256,1024
```
The arguments are similar to the previous table. Note that nsq = 0 is
simulated by setting beta_nsq = 1 and beta_k = 1 (ie a code with a single
reproduction value).
The output should look like:
```
setting k_reorder=0
efSearch=16 0.0147 ms per query, R@1: 0.3409 R@10: 0.4388 R@100: 0.4394 ndis 2629735 nreorder 0
efSearch=64 0.0122 ms per query, R@1: 0.4836 R@10: 0.6490 R@100: 0.6509 ndis 4623221 nreorder 0
efSearch=256 0.0344 ms per query, R@1: 0.5730 R@10: 0.7915 R@100: 0.7951 ndis 11090176 nreorder 0
efSearch=1024 0.2656 ms per query, R@1: 0.6212 R@10: 0.8722 R@100: 0.8765 ndis 33501951 nreorder 0
```
The results with k_reorder=5 are not reported in the paper, they
represent the performance of a "free coding" version of the algorithm.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import print_function
import os
import sys
import time
import numpy as np
import re
import faiss
from multiprocessing.dummy import Pool as ThreadPool
import pdb
import argparse
import datasets
from datasets import sanitize
import neighbor_codec
######################################################
# Command-line parsing
######################################################
parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    """Shorthand: add an argument to the option group currently in `group`."""
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('dataset options')
aa('--db', default='deep1M', help='dataset')
aa('--compute_gt', default=False, action='store_true',
   help='compute and store the groundtruth')

# fixed typo: group title used to read 'index consturction' in --help output
group = parser.add_argument_group('index construction')
aa('--indexkey', default='HNSW32', help='index_factory type')
aa('--efConstruction', default=200, type=int,
   help='HNSW construction factor')
aa('--M0', default=-1, type=int, help='size of base level')
aa('--maxtrain', default=256 * 256, type=int,
   help='maximum number of training points')
aa('--indexfile', default='', help='file to read or write index from')
aa('--add_bs', default=-1, type=int,
   help='add elements index by batches of this size')
aa('--link_singletons', default=False, action='store_true',
   help='do a pass to link in the singletons')

group = parser.add_argument_group(
    'searching (reconstruct_from_neighbors options)')
aa('--beta_centroids', default='',
   help='file with codebook')
aa('--neigh_recons_codes', default='',
   help='file with codes for reconstruction')
aa('--beta_ntrain', default=250000, type=int, help='')
aa('--beta_k', default=256, type=int, help='beta codebook size')
aa('--beta_nsq', default=1, type=int, help='number of beta sub-vectors')
aa('--beta_niter', default=10, type=int, help='')
aa('--k_reorder', default='-1', help='')

group = parser.add_argument_group('searching')
aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--exhaustive', default=False, action='store_true',
   help='report the exhaustive search topline')
aa('--searchthreads', default=-1, type=int,
   help='nb of threads to use at search time')
aa('--efSearch', default='', type=str,
   help='comma-separated values of efSearch to try')

args = parser.parse_args()
print("args:", args)
######################################################
# Load dataset
######################################################
# xt: training vectors, xb: database vectors, xq: queries,
# gt: ground-truth nearest-neighbor ids (may be None)
xt, xb, xq, gt = datasets.load_data(
    dataset=args.db, compute_gt=args.compute_gt)

nq, d = xq.shape
nb, d = xb.shape

######################################################
# Make index
######################################################
if os.path.exists(args.indexfile):
    # Reload a previously built and populated index from disk.
    print("reading", args.indexfile)
    index = faiss.read_index(args.indexfile)

    if isinstance(index, faiss.IndexPreTransform):
        # the HNSW index is wrapped in a vector transform (e.g. OPQ):
        # unwrap it and keep the transform as a Python callable
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x:x  # identity when there is no pre-transform

    hnsw = index_hnsw.hnsw
    hnsw_stats = faiss.cvar.hnsw_stats

else:
    # Build the index from scratch: train, add vectors, store on disk.
    print("build index, key=", args.indexkey)

    index = faiss.index_factory(d, args.indexkey)

    if isinstance(index, faiss.IndexPreTransform):
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x:x

    hnsw = index_hnsw.hnsw
    hnsw.efConstruction = args.efConstruction
    hnsw_stats = faiss.cvar.hnsw_stats
    index.verbose = True
    index_hnsw.verbose = True
    index_hnsw.storage.verbose = True

    if args.M0 != -1:
        print("set level 0 nb of neighbors to", args.M0)
        hnsw.set_nb_neighbors(0, args.M0)

    # train on at most --maxtrain points (contiguous float32 required)
    xt2 = sanitize(xt[:args.maxtrain])
    assert np.all(np.isfinite(xt2))

    print("train, size", xt.shape)
    t0 = time.time()
    index.train(xt2)
    print("  train in %.3f s" % (time.time() - t0))

    print("adding")
    t0 = time.time()
    if args.add_bs == -1:
        index.add(sanitize(xb))
    else:
        # add by batches to bound peak memory usage
        for i0 in range(0, nb, args.add_bs):
            i1 = min(nb, i0 + args.add_bs)
            print("  adding %d:%d / %d" % (i0, i1, nb))
            index.add(sanitize(xb[i0:i1]))

    print("  add in %.3f s" % (time.time() - t0))
    print("storing", args.indexfile)
    faiss.write_index(index, args.indexfile)
######################################################
# Train beta centroids and encode dataset
######################################################
# Only active when a --beta_centroids file is given: attaches a
# ReconstructFromNeighbors (link-and-code) codec to the HNSW index.
if args.beta_centroids:
    print("reordering links")
    index_hnsw.reorder_links()

    if os.path.exists(args.beta_centroids):
        # reuse a previously trained beta codebook
        print("load", args.beta_centroids)
        beta_centroids = np.load(args.beta_centroids)
        nsq, k, M1 = beta_centroids.shape
        # codebook width must match nb of level-0 neighbors + the point itself
        assert M1 == hnsw.nb_neighbors(0) + 1

        rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq)
    else:
        print("train beta centroids")
        rfn = faiss.ReconstructFromNeighbors(
            index_hnsw, args.beta_k, args.beta_nsq)

        # train on the first --beta_ntrain (transformed) database vectors
        xb_full = vec_transform(sanitize(xb[:args.beta_ntrain]))

        beta_centroids = neighbor_codec.train_beta_codebook(
            rfn, xb_full, niter=args.beta_niter)

        print("  storing", args.beta_centroids)
        np.save(args.beta_centroids, beta_centroids)

    faiss.copy_array_to_vector(beta_centroids.ravel(),
                               rfn.codebook)
    index_hnsw.reconstruct_from_neighbors = rfn

    if rfn.k == 1:
        pass     # no codes to take care of
    elif os.path.exists(args.neigh_recons_codes):
        print("loading neigh codes", args.neigh_recons_codes)
        codes = np.load(args.neigh_recons_codes)
        assert codes.size == rfn.code_size * index.ntotal
        faiss.copy_array_to_vector(codes.astype('uint8'),
                                   rfn.codes)
        rfn.ntotal = index.ntotal
    else:
        # encode the whole database with the trained codec, by batches
        print("encoding neigh codes")
        t0 = time.time()

        bs = 1000000 if args.add_bs == -1 else args.add_bs

        for i0 in range(0, nb, bs):
            i1 = min(i0 + bs, nb)
            print("   encode %d:%d / %d [%.3f s]\r" % (
                i0, i1, nb, time.time() - t0), end=' ')
            sys.stdout.flush()
            xbatch = vec_transform(sanitize(xb[i0:i1]))
            rfn.add_codes(i1 - i0, faiss.swig_ptr(xbatch))
        print()

        print("storing %s" % args.neigh_recons_codes)
        codes = faiss.vector_to_array(rfn.codes)
        np.save(args.neigh_recons_codes, codes)
######################################################
# Exhaustive evaluation
######################################################
# Topline: decompress every database vector with the codec and run an
# exact (flat) search on the reconstructions.
if args.exhaustive:
    print("exhaustive evaluation")
    xq_tr = vec_transform(sanitize(xq))
    index2 = faiss.IndexFlatL2(index_hnsw.d)
    accu_recons_error = 0.0

    if faiss.get_num_gpus() > 0:
        print("do eval on GPU")
        co = faiss.GpuMultipleClonerOptions()
        co.shard = False
        index2 = faiss.index_cpu_to_all_gpus(index2, co)

    # process in batches in case the dataset does not fit in RAM
    rh = datasets.ResultHeap(xq_tr.shape[0], 100)
    t0 = time.time()
    bs = 500000

    for i0 in range(0, nb, bs):
        i1 = min(nb, i0 + bs)
        print('  handling batch %d:%d' % (i0, i1))

        # reconstruct the slice from neighbor codes, accumulate MSE
        xb_recons = np.empty(
            (i1 - i0, index_hnsw.d), dtype='float32')
        rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons))

        accu_recons_error += (
            (vec_transform(sanitize(xb[i0:i1])) -
             xb_recons)**2).sum()

        index2.reset()
        index2.add(xb_recons)
        D, I = index2.search(xq_tr, 100)
        rh.add_batch_result(D, I, i0)

    rh.finalize()
    del index2
    t1 = time.time()

    print("done in %.3f s" % (t1 - t0))
    print("total reconstruction error: ", accu_recons_error)
    print("eval retrieval:")
    datasets.evaluate_DI(rh.D, rh.I, gt)
def get_neighbors(hnsw, i, level):
    """Return the list of neighbor ids of node `i` at HNSW level `level`."""
    assert i < hnsw.levels.size()
    assert level < hnsw.levels.at(i)
    # neighbor_range fills [begin, end) offsets into the flat neighbors table
    bounds = np.empty(2, 'uint64')
    hnsw.neighbor_range(i, level,
                        faiss.swig_ptr(bounds), faiss.swig_ptr(bounds[1:]))
    begin, end = bounds
    return [hnsw.neighbors.at(j) for j in range(begin, end)]
#############################################################
# Index is ready
#############################################################
xq = sanitize(xq)

if args.searchthreads != -1:
    print("Setting nb of threads to", args.searchthreads)
    faiss.omp_set_num_threads(args.searchthreads)

if gt is None:
    print("no valid groundtruth -- exit")
    sys.exit()

# Parse the sweep lists. Filter out empty tokens: the default
# --efSearch '' used to crash on int('') -- now it simply runs no sweep.
k_reorders = [int(x) for x in args.k_reorder.split(',') if x]
efSearchs = [int(x) for x in args.efSearch.split(',') if x]

for k_reorder in k_reorders:
    if index_hnsw.reconstruct_from_neighbors:
        print("setting k_reorder=%d" % k_reorder)
        index_hnsw.reconstruct_from_neighbors.k_reorder = k_reorder

    for efSearch in efSearchs:
        print("efSearch=%-4d" % efSearch, end=' ')
        hnsw.efSearch = efSearch
        hnsw_stats.reset()
        # evaluate prints per-query time and recall@r; the stats counters
        # report distances computed / reorderings during this sweep point
        datasets.evaluate(xq, gt, index, k=args.k, endl=False)
        print("ndis %d nreorder %d" % (hnsw_stats.ndis, hnsw_stats.nreorder))
#! /usr/bin/env python2
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Common functions to load datasets and compute their ground-truth
"""
from __future__ import print_function
import time
import numpy as np
import faiss
import pdb
import sys
# set this to the directory that contains the datafiles.
# deep1b data should be at simdir + 'deep1b'
# bigann data should be at simdir + 'bigann'
simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/'
#################################################################
# Small I/O functions
#################################################################
def ivecs_read(fname):
    """Load a whole .ivecs file into memory as an (n, d) int32 array.

    Each record is one int32 header holding the dimension d, followed by
    d int32 values; the header column is stripped from the result.
    """
    raw = np.fromfile(fname, dtype='int32')
    dim = raw[0]
    records = raw.reshape(-1, dim + 1)
    return records[:, 1:].copy()
def fvecs_read(fname):
    """Load a .fvecs file: same framing as .ivecs, but float32 payload."""
    as_ints = ivecs_read(fname)
    return as_ints.view('float32')
def ivecs_mmap(fname):
    """Memory-map an .ivecs file as an (n, d) int32 view (no copy)."""
    mapped = np.memmap(fname, dtype='int32', mode='r')
    dim = mapped[0]
    # drop the per-record dimension header column
    return mapped.reshape(-1, dim + 1)[:, 1:]
def fvecs_mmap(fname):
    """Memory-map an .fvecs file; .ivecs framing with float32 payload."""
    ints = ivecs_mmap(fname)
    return ints.view('float32')
def bvecs_mmap(fname):
    """Memory-map a .bvecs file as an (n, d) uint8 view (no copy).

    Each record starts with a 4-byte little-endian int32 dimension header.
    """
    raw = np.memmap(fname, dtype='uint8', mode='r')
    dim = raw[:4].view('int32')[0]
    return raw.reshape(-1, dim + 4)[:, 4:]
def ivecs_write(fname, m):
    """Write an (n, d) int32 array in .ivecs format (d prepended per row)."""
    n, d = m.shape
    framed = np.empty((n, d + 1), dtype='int32')
    framed[:, 0] = d
    framed[:, 1:] = m
    framed.tofile(fname)
def fvecs_write(fname, m):
    """Write a float32 array in .fvecs format (reuses the .ivecs framing)."""
    as_float = m.astype('float32')
    ivecs_write(fname, as_float.view('int32'))
#################################################################
# Dataset
#################################################################
def sanitize(x):
    "Return x as a C-contiguous float32 array, as the faiss API requires."
    return np.ascontiguousarray(x, dtype='float32')
class ResultHeap:
    """ Combine query results from a sliced dataset """

    def __init__(self, nq, k):
        " nq: number of query vectors, k: number of results per query "
        self.I = np.zeros((nq, k), dtype='int64')
        self.D = np.zeros((nq, k), dtype='float32')
        self.nq, self.k = nq, k
        heaps = faiss.float_maxheap_array_t()
        heaps.k = k
        heaps.nh = nq
        # The heap structure writes directly into self.D / self.I through
        # raw SWIG pointers; keeping the arrays as attributes keeps them
        # alive for as long as the heaps object is used.
        heaps.val = faiss.swig_ptr(self.D)
        heaps.ids = faiss.swig_ptr(self.I)
        heaps.heapify()
        self.heaps = heaps

    def add_batch_result(self, D, I, i0):
        # D, I: top-k results computed on a database slice starting at
        # offset i0; shift ids to global numbering before merging.
        assert D.shape == (self.nq, self.k)
        assert I.shape == (self.nq, self.k)
        I += i0
        # NOTE(review): the two self.k arguments are presumably the per-query
        # result count and the id stride expected by addn_with_ids -- confirm
        # against the faiss float_maxheap_array_t API.
        self.heaps.addn_with_ids(
            self.k, faiss.swig_ptr(D),
            faiss.swig_ptr(I), self.k)

    def finalize(self):
        # sort each per-query heap so D / I come out in increasing distance
        self.heaps.reorder()
def compute_GT_sliced(xb, xq, k):
    """Exact k-NN ground truth of xq in xb, computed slice by slice.

    Database slices are added to a (GPU-cloned) flat index one at a time and
    the per-slice results are merged in a ResultHeap, so xb never has to fit
    in index memory at once. Returns the (nq, k) id array.
    """
    print("compute GT")
    t0 = time.time()
    nb, d = xb.shape
    nq, d = xq.shape
    heap = ResultHeap(nq, k)
    block = 10 ** 5
    queries = sanitize(xq)
    db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    # compute ground-truth by blocks, and merge results into the heaps
    for start in range(0, nb, block):
        stop = min(nb, start + block)
        db_gt.add(sanitize(xb[start:stop]))
        D, I = db_gt.search(queries, k)
        heap.add_batch_result(D, I, start)
        db_gt.reset()
        print("\r %d/%d, %.3f s" % (start, nb, time.time() - t0), end=' ')
        sys.stdout.flush()
    print()
    heap.finalize()
    print("GT time: %.3f s" % (time.time() - t0))
    return heap.I
def do_compute_gt(xb, xq, k):
    """Brute-force k-NN ground truth of xq in xb, as an int32 id array.

    Small databases are searched in one shot; large ones go through the
    sliced variant to bound memory use.
    """
    print("computing GT")
    nb, d = xb.shape
    gpu_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    if nb < 100 * 1000:
        print(" add")
        gpu_index.add(np.ascontiguousarray(xb, dtype='float32'))
        print(" search")
        _, nn = gpu_index.search(
            np.ascontiguousarray(xq, dtype='float32'), k)
    else:
        nn = compute_GT_sliced(xb, xq, k)
    return nn.astype('int32')
def load_data(dataset='deep1M', compute_gt=False):
    """Load the (train, base, query, groundtruth) arrays of a benchmark.

    Supported names: 'sift1M', 'bigann<n>M'/'bigann1B', 'deep<n>k/M'/'deep1B'.
    Files are looked up under the module-level `simdir` root; large bases are
    memory-mapped. When compute_gt is set (deep* only), the ground truth is
    computed by brute force and stored before being reloaded.
    """
    print("load data", dataset)

    if dataset == 'sift1M':
        basedir = simdir + 'sift1M/'
        xt = fvecs_read(basedir + "sift_learn.fvecs")
        xb = fvecs_read(basedir + "sift_base.fvecs")
        xq = fvecs_read(basedir + "sift_query.fvecs")
        gt = ivecs_read(basedir + "sift_groundtruth.ivecs")

    elif dataset.startswith('bigann'):
        basedir = simdir + 'bigann/'
        # database size in millions, e.g. "bigann10M" -> 10
        if dataset == "bigann1B":
            dbsize = 1000
        else:
            dbsize = int(dataset[6:-1])
        xb = bvecs_mmap(basedir + 'bigann_base.bvecs')
        xq = bvecs_mmap(basedir + 'bigann_query.bvecs')
        xt = bvecs_mmap(basedir + 'bigann_learn.bvecs')
        # trim xb to correct size
        xb = xb[:dbsize * 1000 * 1000]
        gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize)

    elif dataset.startswith("deep"):
        basedir = simdir + 'deep1b/'
        szsuf = dataset[4:]
        if szsuf == '1B':
            dbsize = 10 ** 9
        elif szsuf[-1] == 'M':
            dbsize = 10 ** 6 * int(szsuf[:-1])
        elif szsuf[-1] == 'k':
            dbsize = 1000 * int(szsuf[:-1])
        else:
            assert False, "did not recognize suffix " + szsuf
        xt = fvecs_mmap(basedir + "learn.fvecs")
        xb = fvecs_mmap(basedir + "base.fvecs")[:dbsize]
        xq = fvecs_read(basedir + "deep1B_queries.fvecs")
        gt_fname = basedir + "%s_groundtruth.ivecs" % dataset
        if compute_gt:
            gt = do_compute_gt(xb, xq, 100)
            print("store", gt_fname)
            ivecs_write(gt_fname, gt)
        gt = ivecs_read(gt_fname)

    else:
        assert False

    print("dataset %s sizes: B %s Q %s T %s" % (
        dataset, xb.shape, xq.shape, xt.shape))
    return xt, xb, xq, gt
#################################################################
# Evaluation
#################################################################
def evaluate_DI(D, I, gt):
    """Print recall@{1,10,100,...} of result ids I against gt's first column.

    D is accepted for signature symmetry with evaluate() but is not used.
    """
    nq = gt.shape[0]
    k = I.shape[1]
    rank = 1
    while rank <= k:
        hits = (I[:, :rank] == gt[:, :1]).sum()
        print("R@%d: %.4f" % (rank, hits / float(nq)), end=' ')
        rank *= 10
def evaluate(xq, gt, index, k=100, endl=True):
    """Search xq on index, print timing and recall@r, and return (D, I)."""
    t0 = time.time()
    D, I = index.search(xq, k)
    elapsed = time.time() - t0
    nq = xq.shape[0]
    print("\t %8.4f ms per query, " % (elapsed * 1000.0 / nq), end=' ')
    # recall@r against the first ground-truth column, for r = 1, 10, 100...
    rank = 1
    while rank <= k:
        recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq)
        print("R@%d: %.4f" % (rank, recall), end=' ')
        rank *= 10
    if endl:
        print()
    return D, I
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This is the training code for the link and code. Especially the
neighbors_kmeans function implements the EM-algorithm to find the
appropriate weightings and cluster them.
"""
from __future__ import print_function
import time
import numpy as np
import faiss
#----------------------------------------------------------
# Utils
#----------------------------------------------------------
def sanitize(x):
    "Convert to a contiguous float32 array (what the faiss API expects)."
    return np.ascontiguousarray(x, dtype='float32')
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    "Runs kmeans on one or several GPUs"
    # x: (n, d) training vectors; k: number of centroids; ngpu == 0 runs
    # the assignment step on CPU instead.
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        # NOTE(review): `res` (GPU resources) must stay alive while `index`
        # is in use -- keep this list until clus.train returns.
        res = [faiss.StandardGpuResources() for i in range(ngpu)]

        flat_config = []
        for i in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            # one flat index per GPU, replicated behind an IndexReplicas
            indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                       for i in range(ngpu)]
            index = faiss.IndexReplicas()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    stats = clus.iteration_stats
    stats = [stats.at(i) for i in range(stats.size())]
    obj = np.array([st.obj for st in stats])
    print("final objective: %.4g" % obj[-1])

    return centroids.reshape(k, d)
#----------------------------------------------------------
# Learning the codebook from neighbors
#----------------------------------------------------------
# works with both a full Inn table and dynamically generated neighbors
def get_Inn_shape(Inn):
    """Return (N, knn), whether Inn is a materialized (N, knn) neighbor
    table or an (N, knn, sq) tuple describing dynamically generated ones."""
    if type(Inn) is tuple:
        return Inn[:2]
    return Inn.shape
def get_neighbor_table(x_coded, Inn, i):
    """Neighbor vectors of element i.

    With an array Inn, rows of x_coded are gathered directly. With a tuple
    Inn = (N, knn, sq), x_coded is a ReconstructFromNeighbors object and the
    table is generated by it, restricted to sub-quantizer sq's dimensions.
    """
    if type(Inn) is tuple:
        rfn = x_coded
        table = np.zeros((rfn.M + 1, rfn.index.d), dtype='float32')
        rfn.get_neighbor_table(int(i), faiss.swig_ptr(table))
        _, _, sq = Inn
        d0 = sq * rfn.dsub
        return table[:, d0: d0 + rfn.dsub]
    return x_coded[Inn[i, :], :]
# Function that produces the best regression values from the vector
# and its neighbors
def regress_from_neighbors(x, x_coded, Inn):
    """Least-squares weights expressing each x[i] over its coded neighbors.

    Returns an (N, knn) array of regression coefficients (one row per
    vector), solved with np.linalg.lstsq per element.
    """
    (N, knn) = get_Inn_shape(Inn)
    betas = np.zeros((N, knn))
    t0 = time.time()
    # Progress interval. This was `N / 10`, a float under Python 3, which
    # made `i % (N / 10) == 0` fire erratically (py2 -> py3 fix).
    step = max(N // 10, 1)
    for i in range(N):
        xi = x[i, :]
        NNi = get_neighbor_table(x_coded, Inn, i)
        betas[i, :] = np.linalg.lstsq(NNi.transpose(), xi, rcond=0.01)[0]
        if i % step == 0:
            print("[%d:%d]  %6.3f s" % (i, i + step, time.time() - t0))
    return betas
# find the best beta minimizing ||x-x_coded[Inn,:]*beta||^2
def regress_opt_beta(x, x_coded, Inn):
    """Single beta minimizing sum_i ||x[i] - x_coded[Inn[i], :] * beta||^2."""
    (N, knn) = get_Inn_shape(Inn)
    d = x.shape[1]
    # stack every element's d equations into one tall least-squares system
    X = np.zeros((d * N))
    Y = np.zeros((d * N, knn))
    for i in range(N):
        rows = slice(i * d, (i + 1) * d)
        X[rows] = x[i, :]
        Y[rows, :] = get_neighbor_table(x_coded, Inn, i).transpose()
    return np.linalg.lstsq(Y, X, rcond=0.01)[0]
# Find the best encoding by minimizing the reconstruction error using
# a set of pre-computed beta values
def assign_beta(beta_centroids, x, x_coded, Inn, verbose=True):
    """Assign to each x[i] the beta codeword with the lowest
    reconstruction error; returns an (N,) int32 array of indices.

    With a tuple Inn the assignment is delegated to the C++ implementation.
    """
    if type(Inn) == tuple:
        return assign_beta_2(beta_centroids, x, x_coded, Inn)
    (N, knn) = Inn.shape
    x_ibeta = np.zeros((N), dtype='int32')
    t0 = time.time()
    # Progress interval; was `N / 10` (float under Python 3), making the
    # `i % ... == 0` test fire erratically (py2 -> py3 fix).
    step = max(N // 10, 1)
    for i in range(N):
        NNi = x_coded[Inn[i, :]]
        # try every codeword on this element's neighbor table and keep
        # the one with the smallest squared reconstruction error
        x_reg_all = np.dot(beta_centroids, NNi)
        err = ((x_reg_all - x[i, :]) ** 2).sum(axis=1)
        x_ibeta[i] = err.argmin()
        if verbose:
            if i % step == 0:
                print("[%d:%d]  %6.3f s" % (i, i + step, time.time() - t0))
    return x_ibeta
# Reconstruct a set of vectors using the beta_centroids, the
# assignment, the encoded neighbors identified by the list Inn (which
# includes the vector itself)
def recons_from_neighbors(beta_centroids, x_ibeta, x_coded, Inn):
    """Reconstruct vectors as beta-weighted combinations of their neighbors.

    x_ibeta[i] selects the codeword for element i; Inn lists the encoded
    neighbor ids (including the vector itself).
    NOTE(review): the output is sized like x_coded but only the first
    N = len(Inn) rows are written -- presumably N == len(x_coded); confirm.
    """
    (N, knn) = Inn.shape
    x_rec = np.zeros(x_coded.shape)
    t0 = time.time()
    # Progress interval; was the py2 leftover `N / 10` (float under py3).
    step = max(N // 10, 1)
    for i in range(N):
        NNi = x_coded[Inn[i, :]]
        x_rec[i, :] = np.dot(beta_centroids[x_ibeta[i]], NNi)
        if i % step == 0:
            print("[%d:%d]  %6.3f s" % (i, i + step, time.time() - t0))
    return x_rec
# Compute a EM-like algorithm trying at optimizing the beta such as they
# minimize the reconstruction error from the neighbors
def neighbors_kmeans(x, x_coded, Inn, K, ngpus=1, niter=5):
    """EM-like optimization of a K-entry beta codebook.

    Alternates assignment of each vector to its best codeword with a
    least-squares re-estimation of every codeword; empty clusters are
    re-seeded by splitting a random populated one. Returns the (K, knn)
    codebook.
    """
    # First compute centroids using a regular k-means algorithm
    betas = regress_from_neighbors(x, x_coded, Inn)
    beta_centroids = train_kmeans(
        sanitize(betas), K, ngpus, max_points_per_centroid=1000000)
    _, knn = get_Inn_shape(Inn)
    d = x.shape[1]
    rs = np.random.RandomState()
    for iter in range(niter):
        print('iter', iter)
        idx = assign_beta(beta_centroids, x, x_coded, Inn, verbose=False)
        hist = np.bincount(idx)
        for cl0 in np.where(hist == 0)[0]:
            print("   cluster %d empty, split" % cl0, end=' ')
            cl1 = idx[np.random.randint(idx.size)]
            pos = np.nonzero(idx == cl1)[0]
            # integer division required: `pos.size / 2` is a float under
            # Python 3 and RandomState.choice rejects non-integer sizes
            pos = rs.choice(pos, pos.size // 2)
            print("   cl %d -> %d + %d" % (
                cl1, len(pos), hist[cl1] - len(pos)))
            idx[pos] = cl0
        hist = np.bincount(idx)
        tot_err = 0
        for k in range(K):
            # re-estimate codeword k from all vectors assigned to it
            pos = np.nonzero(idx == k)[0]
            npos = pos.shape[0]
            X = np.zeros(d * npos)
            Y = np.zeros((d * npos, knn))
            for i in range(npos):
                X[i * d:(i + 1) * d] = x[pos[i], :]
                neighbor_table = get_neighbor_table(x_coded, Inn, pos[i])
                Y[i * d:(i + 1) * d, :] = neighbor_table.transpose()
            sol, residuals, _, _ = np.linalg.lstsq(Y, X, rcond=0.01)
            if residuals.size > 0:
                tot_err += residuals.sum()
            beta_centroids[k, :] = sol
        print('   err=%g' % tot_err)
    return beta_centroids
# assign the betas in C++
def assign_beta_2(beta_centroids, x, rfn, Inn):
    """Assign betas for one sub-quantizer via the C++ encoder.

    Embeds the single sub-quantizer's codebook and data into full-width
    zero buffers, lets ReconstructFromNeighbors encode them, and extracts
    the codes of the requested sub-quantizer.
    """
    _, _, sq = Inn
    if rfn.k == 1:
        # a single codeword: the assignment is trivially all zeros
        return np.zeros(x.shape[0], dtype=int)
    # pad codebook and vectors with dummy dimensions for the other sq's
    full_codebook = np.zeros(
        (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32')
    full_codebook[sq] = beta_centroids
    full_x = np.zeros((len(x), rfn.d), dtype='float32')
    full_x[:, sq * rfn.dsub: (sq + 1) * rfn.dsub] = x
    # reset the encoder state before re-encoding
    rfn.codes.clear()
    rfn.ntotal = 0
    faiss.copy_array_to_vector(full_codebook.ravel(), rfn.codebook)
    rfn.add_codes(len(x), faiss.swig_ptr(full_x))
    codes = faiss.vector_to_array(rfn.codes).reshape(-1, rfn.nsq)
    return codes[:, sq]
#######################################################
# For usage from bench_storages.py
def train_beta_codebook(rfn, xb_full, niter=10):
    """Train one beta codebook per sub-quantizer of rfn; return them stacked
    as an (nsq, k, M+1) array."""
    codebooks = []
    for sq in range(rfn.nsq):
        d0, d1 = sq * rfn.dsub, (sq + 1) * rfn.dsub
        print("training subquantizer %d/%d on dimensions %d:%d" % (
            sq, rfn.nsq, d0, d1))
        centroids_sq = neighbors_kmeans(
            xb_full[:, d0:d1], rfn, (xb_full.shape[0], rfn.M + 1, sq),
            rfn.k,
            ngpus=0, niter=niter)
        codebooks.append(centroids_sq)
        # clear encoder state between sub-quantizers
        rfn.ntotal = 0
        rfn.codes.clear()
        rfn.codebook.clear()
    return np.stack(codebooks)
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
// -*- c++ -*-
#include "AutoTune_c.h"
#include <faiss/AutoTune.h>
#include <cstring>
#include "macros_impl.h"
using faiss::Index;
using faiss::ParameterRange;
using faiss::ParameterSpace;
/// Name of the parameter described by this range.
const char* faiss_ParameterRange_name(const FaissParameterRange* range) {
    auto impl = reinterpret_cast<const ParameterRange*>(range);
    return impl->name.c_str();
}
/// Expose the raw values array of a ParameterRange (pointer + length).
/// The pointer is invalidated by any later modification of the range.
void faiss_ParameterRange_values(
        FaissParameterRange* range,
        double** p_values,
        size_t* p_size) {
    auto& vals = reinterpret_cast<ParameterRange*>(range)->values;
    *p_size = vals.size();
    *p_values = vals.data();
}
/// Allocate a default-constructed ParameterSpace and return it via *space.
int faiss_ParameterSpace_new(FaissParameterSpace** space) {
    try {
        *space = reinterpret_cast<FaissParameterSpace*>(new ParameterSpace());
    }
    CATCH_AND_HANDLE
}
// Generates faiss_ParameterSpace_free via the project macro.
DEFINE_DESTRUCTOR(ParameterSpace)

/// Number of parameter combinations (product of per-range value counts).
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace* space) {
    return reinterpret_cast<const ParameterSpace*>(space)->n_combinations();
}
/// Write the name of combination `cno` into char_buffer (at most `size`
/// bytes). The result is now always NUL-terminated: plain strncpy leaves
/// the buffer unterminated when the name is truncated.
int faiss_ParameterSpace_combination_name(
        const FaissParameterSpace* space,
        size_t cno,
        char* char_buffer,
        size_t size) {
    try {
        auto rep = reinterpret_cast<const ParameterSpace*>(space)
                           ->combination_name(cno);
        if (size > 0) {
            strncpy(char_buffer, rep.c_str(), size);
            char_buffer[size - 1] = '\0'; // guarantee termination on truncation
        }
    }
    CATCH_AND_HANDLE
}
/// Apply a parameter-combination string to an index.
int faiss_ParameterSpace_set_index_parameters(
        const FaissParameterSpace* space,
        FaissIndex* cindex,
        const char* param_string) {
    try {
        auto ps = reinterpret_cast<const ParameterSpace*>(space);
        ps->set_index_parameters(
                reinterpret_cast<Index*>(cindex), param_string);
    }
    CATCH_AND_HANDLE
}
/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
        const FaissParameterSpace* space,
        FaissIndex* cindex,
        size_t cno) {
    try {
        auto ps = reinterpret_cast<const ParameterSpace*>(space);
        ps->set_index_parameters(reinterpret_cast<Index*>(cindex), cno);
    }
    CATCH_AND_HANDLE
}
/// Set a single named parameter on an index.
int faiss_ParameterSpace_set_index_parameter(
        const FaissParameterSpace* space,
        FaissIndex* cindex,
        const char* name,
        double value) {
    try {
        auto ps = reinterpret_cast<const ParameterSpace*>(space);
        ps->set_index_parameter(
                reinterpret_cast<Index*>(cindex), name, value);
    }
    CATCH_AND_HANDLE
}
/// print a description on stdout
void faiss_ParameterSpace_display(const FaissParameterSpace* space) {
    auto ps = reinterpret_cast<const ParameterSpace*>(space);
    ps->display();
}
/// Add a named parameter range (or fetch the existing one); optionally
/// hand it back through p_range.
int faiss_ParameterSpace_add_range(
        FaissParameterSpace* space,
        const char* name,
        FaissParameterRange** p_range) {
    try {
        auto& range = reinterpret_cast<ParameterSpace*>(space)->add_range(name);
        if (p_range != nullptr) {
            *p_range = reinterpret_cast<FaissParameterRange*>(&range);
        }
    }
    CATCH_AND_HANDLE
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
// -*- c -*-
/* Plain-C declarations mirroring the C++ faiss::ParameterRange /
 * faiss::ParameterSpace API (see AutoTune_c.cpp for the implementations). */
#ifndef FAISS_AUTO_TUNE_C_H
#define FAISS_AUTO_TUNE_C_H

#include "Index_c.h"
#include "faiss_c.h"

#ifdef __cplusplus
extern "C" {
#endif

/// possible values of a parameter, sorted from least to most expensive/accurate
FAISS_DECLARE_CLASS(ParameterRange)

FAISS_DECLARE_GETTER(ParameterRange, const char*, name)

/// Getter for the values in the range. The output values are invalidated
/// upon any other modification of the range.
void faiss_ParameterRange_values(FaissParameterRange*, double**, size_t*);

/** Uses a-priori knowledge on the Faiss indexes to extract tunable parameters.
 */
FAISS_DECLARE_CLASS(ParameterSpace)
FAISS_DECLARE_DESTRUCTOR(ParameterSpace)

/// Parameter space default constructor
int faiss_ParameterSpace_new(FaissParameterSpace** space);

/// nb of combinations, = product of values sizes
size_t faiss_ParameterSpace_n_combinations(const FaissParameterSpace*);

/// get string representation of the combination
/// by writing it to the given character buffer.
/// A buffer size of 1000 ensures that the full name is collected.
int faiss_ParameterSpace_combination_name(
        const FaissParameterSpace*,
        size_t,
        char*,
        size_t);

/// set a combination of parameters described by a string
int faiss_ParameterSpace_set_index_parameters(
        const FaissParameterSpace*,
        FaissIndex*,
        const char*);

/// set a combination of parameters on an index
int faiss_ParameterSpace_set_index_parameters_cno(
        const FaissParameterSpace*,
        FaissIndex*,
        size_t);

/// set one of the parameters
int faiss_ParameterSpace_set_index_parameter(
        const FaissParameterSpace*,
        FaissIndex*,
        const char*,
        double);

/// print a description on stdout
void faiss_ParameterSpace_display(const FaissParameterSpace*);

/// add a new parameter (or return it if it exists)
int faiss_ParameterSpace_add_range(
        FaissParameterSpace*,
        const char*,
        FaissParameterRange**);

#ifdef __cplusplus
}
#endif
#endif
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)

project(faiss_c_library LANGUAGES C CXX)

set(CMAKE_C_STANDARD 11)

# Sources of the plain-C wrapper library around the C++ faiss API.
set(FAISS_C_SRC
  AutoTune_c.cpp
  Clustering_c.cpp
  IndexFlat_c.cpp
  IndexIVFFlat_c.cpp
  IndexIVF_c.cpp
  IndexLSH_c.cpp
  IndexPreTransform_c.cpp
  VectorTransform_c.cpp
  IndexShards_c.cpp
  IndexReplicas_c.cpp
  Index_c.cpp
  IndexScalarQuantizer_c.cpp
  MetaIndexes_c.cpp
  clone_index_c.cpp
  error_impl.cpp
  index_factory_c.cpp
  index_io_c.cpp
  impl/AuxIndexStructures_c.cpp
  utils/distances_c.cpp
)
add_library(faiss_c ${FAISS_C_SRC})
# PRIVATE: consumers of faiss_c do not inherit the C++ faiss target.
target_link_libraries(faiss_c PRIVATE faiss)

# Install each header under include/faiss/<p>/, preserving its
# relative subdirectory (impl/, utils/).
function(faiss_install_headers headers p)
  foreach(h ${headers})
    get_filename_component(f ${h} DIRECTORY)
    install(FILES ${h}
      DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/${p}/${f}
    )
  endforeach()
endfunction()

file(GLOB FAISS_C_API_HEADERS
  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
  "*.h"
  "impl/*.h"
  "utils/*.h")

faiss_install_headers("${FAISS_C_API_HEADERS}" c_api)

# Demo program; built only on demand (make example_c).
add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)

if(FAISS_ENABLE_GPU)
  add_subdirectory(gpu)
endif()
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
// -*- c++ -*-
#include "Clustering_c.h"
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <vector>
#include "macros_impl.h"
extern "C" {

using faiss::Clustering;
using faiss::ClusteringIterationStats;
using faiss::ClusteringParameters;
using faiss::Index;

// Plain-C field getters generated by the project macro; each expands to
// faiss_<Class>_<field>(const Faiss<Class>*).
DEFINE_GETTER(Clustering, int, niter)
DEFINE_GETTER(Clustering, int, nredo)
DEFINE_GETTER(Clustering, int, verbose)
DEFINE_GETTER(Clustering, int, spherical)
DEFINE_GETTER(Clustering, int, int_centroids)
DEFINE_GETTER(Clustering, int, update_index)
DEFINE_GETTER(Clustering, int, frozen_centroids)
DEFINE_GETTER(Clustering, int, min_points_per_centroid)
DEFINE_GETTER(Clustering, int, max_points_per_centroid)
DEFINE_GETTER(Clustering, int, seed)
DEFINE_GETTER(Clustering, size_t, decode_block_size)

/// getter for d
DEFINE_GETTER(Clustering, size_t, d)
/// getter for k
DEFINE_GETTER(Clustering, size_t, k)

// per-iteration statistics exposed to C callers
DEFINE_GETTER(ClusteringIterationStats, float, obj)
DEFINE_GETTER(ClusteringIterationStats, double, time)
DEFINE_GETTER(ClusteringIterationStats, double, time_search)
DEFINE_GETTER(ClusteringIterationStats, double, imbalance_factor)
DEFINE_GETTER(ClusteringIterationStats, int, nsplit)
/// Fill *params with the defaults of the C++ ClusteringParameters.
void faiss_ClusteringParameters_init(FaissClusteringParameters* params) {
    ClusteringParameters defaults;
    params->niter = defaults.niter;
    params->nredo = defaults.nredo;
    params->verbose = defaults.verbose;
    params->spherical = defaults.spherical;
    params->int_centroids = defaults.int_centroids;
    params->update_index = defaults.update_index;
    params->frozen_centroids = defaults.frozen_centroids;
    params->min_points_per_centroid = defaults.min_points_per_centroid;
    params->max_points_per_centroid = defaults.max_points_per_centroid;
    params->seed = defaults.seed;
    params->decode_block_size = defaults.decode_block_size;
}
// This conversion is required because the two types are not memory-compatible
inline ClusteringParameters from_faiss_c(
        const FaissClusteringParameters* params) {
    ClusteringParameters cp;
    cp.niter = params->niter;
    cp.nredo = params->nredo;
    cp.verbose = params->verbose;
    cp.spherical = params->spherical;
    cp.int_centroids = params->int_centroids;
    cp.update_index = params->update_index;
    cp.frozen_centroids = params->frozen_centroids;
    cp.min_points_per_centroid = params->min_points_per_centroid;
    cp.max_points_per_centroid = params->max_points_per_centroid;
    cp.seed = params->seed;
    cp.decode_block_size = params->decode_block_size;
    return cp;
}
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
        FaissClustering* clustering,
        float** centroids,
        size_t* size) {
    std::vector<float>& v =
            reinterpret_cast<Clustering*>(clustering)->centroids;
    if (centroids != nullptr) {
        *centroids = v.data();
    }
    if (size != nullptr) {
        *size = v.size();
    }
}
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
        FaissClustering* clustering,
        FaissClusteringIterationStats** iteration_stats,
        size_t* size) {
    std::vector<ClusteringIterationStats>& stats =
            reinterpret_cast<Clustering*>(clustering)->iteration_stats;
    if (iteration_stats != nullptr) {
        *iteration_stats =
                reinterpret_cast<FaissClusteringIterationStats*>(stats.data());
    }
    if (size != nullptr) {
        *size = stats.size();
    }
}
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k) {
    try {
        auto c = new Clustering(d, k);
        *p_clustering = reinterpret_cast<FaissClustering*>(c);
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Constructor taking explicit clustering parameters.
int faiss_Clustering_new_with_params(
        FaissClustering** p_clustering,
        int d,
        int k,
        const FaissClusteringParameters* cp) {
    try {
        auto c = new Clustering(d, k, from_faiss_c(cp));
        *p_clustering = reinterpret_cast<FaissClustering*>(c);
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Index is used during the assignment stage
int faiss_Clustering_train(
        FaissClustering* clustering,
        idx_t n,
        const float* x,
        FaissIndex* index) {
    try {
        auto clus = reinterpret_cast<Clustering*>(clustering);
        clus->train(n, x, *reinterpret_cast<Index*>(index));
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Release a Clustering allocated by faiss_Clustering_new*.
void faiss_Clustering_free(FaissClustering* clustering) {
    Clustering* impl = reinterpret_cast<Clustering*>(clustering);
    delete impl;
}
/// One-shot k-means over n vectors of dimension d; writes k centroids and
/// (optionally) the final quantization error to *q_error.
int faiss_kmeans_clustering(
        size_t d,
        size_t n,
        size_t k,
        const float* x,
        float* centroids,
        float* q_error) {
    try {
        const float err = faiss::kmeans_clustering(d, n, k, x, centroids);
        if (q_error != nullptr) {
            *q_error = err;
        }
        return 0;
    }
    CATCH_AND_HANDLE
}
} // extern "C"
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved
// -*- c -*-
#ifndef FAISS_CLUSTERING_C_H
#define FAISS_CLUSTERING_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
*/
typedef struct FaissClusteringParameters {
int niter; ///< clustering iterations
int nredo; ///< redo clustering this many times and keep best
int verbose; ///< (bool)
int spherical; ///< (bool) do we want normalized centroids?
int int_centroids; ///< (bool) round centroids coordinates to integer
int update_index; ///< (bool) update index after each iteration?
int frozen_centroids; ///< (bool) use the centroids provided as input and do
///< not change them during iterations
int min_points_per_centroid; ///< otherwise you get a warning
int max_points_per_centroid; ///< to limit size of dataset
int seed; ///< seed for the random number generator
size_t decode_block_size; ///< how many vectors at a time to decode
} FaissClusteringParameters;
/// Sets the ClusteringParameters object with reasonable defaults
void faiss_ClusteringParameters_init(FaissClusteringParameters* params);
/** clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
* On output, the centroids table is set to the latest version
* of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
* initialization.
*
* To do several clusterings, just call train() several times on
* different training sets, clearing the centroid table in between.
*/
FAISS_DECLARE_CLASS(Clustering)
FAISS_DECLARE_GETTER(Clustering, int, niter)
FAISS_DECLARE_GETTER(Clustering, int, nredo)
FAISS_DECLARE_GETTER(Clustering, int, verbose)
FAISS_DECLARE_GETTER(Clustering, int, spherical)
FAISS_DECLARE_GETTER(Clustering, int, int_centroids)
FAISS_DECLARE_GETTER(Clustering, int, update_index)
FAISS_DECLARE_GETTER(Clustering, int, frozen_centroids)
FAISS_DECLARE_GETTER(Clustering, int, min_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, max_points_per_centroid)
FAISS_DECLARE_GETTER(Clustering, int, seed)
FAISS_DECLARE_GETTER(Clustering, size_t, decode_block_size)
/// getter for d
FAISS_DECLARE_GETTER(Clustering, size_t, d)
/// getter for k
FAISS_DECLARE_GETTER(Clustering, size_t, k)
FAISS_DECLARE_CLASS(ClusteringIterationStats)
FAISS_DECLARE_GETTER(ClusteringIterationStats, float, obj)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time_search)
FAISS_DECLARE_GETTER(ClusteringIterationStats, double, imbalance_factor)
FAISS_DECLARE_GETTER(ClusteringIterationStats, int, nsplit)
/// getter for centroids (size = k * d)
void faiss_Clustering_centroids(
FaissClustering* clustering,
float** centroids,
size_t* size);
/// getter for iteration stats
void faiss_Clustering_iteration_stats(
FaissClustering* clustering,
FaissClusteringIterationStats** iteration_stats,
size_t* size);
/// the only mandatory parameters are k and d
int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k);
int faiss_Clustering_new_with_params(
FaissClustering** p_clustering,
int d,
int k,
const FaissClusteringParameters* cp);
int faiss_Clustering_train(
FaissClustering* clustering,
idx_t n,
const float* x,
FaissIndex* index);
void faiss_Clustering_free(FaissClustering* clustering);
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @param q_error final quantization error
* @return error code
*/
int faiss_kmeans_clustering(
size_t d,
size_t n,
size_t k,
const float* x,
float* centroids,
float* q_error);
#ifdef __cplusplus
}
#endif
#endif
Faiss C API
===========
Faiss provides a pure C interface, which can subsequently be used either in pure C programs or to produce bindings for programming languages with Foreign Function Interface (FFI) support. Although this is not required for the Python interface, some other programming languages (e.g. Rust and Julia) do not have SWIG support.
Compilation instructions
------------------------
The full contents of the pure C API are in the ["c_api"](c_api/) folder.
Please be sure to follow the instructions on [building the main C++ library](../INSTALL.md#step-1-compiling-the-c-faiss) first.
Then, enter the [c_api](c_api/) directory and run
`make`
This builds the dynamic library "faiss_c", containing the full implementation of Faiss and the necessary wrappers for the C interface. It does not depend on libfaiss.a or the C++ standard library. It will also build an example program `bin/example_c`.
Using the API
-------------
The C API is composed of:
- A set of C header files comprising the main Faiss interfaces, converted for use in C. Each file follows the format `«name»_c.h`, where `«name»` is the respective name from the C++ API. For example, the file [Index_c.h](./Index_c.h) file corresponds to the base `Index` API. Functions are declared with the `faiss_` prefix (e.g. `faiss_IndexFlat_new`), whereas new types have the `Faiss` prefix (e.g. `FaissIndex`, `FaissMetricType`, ...).
- A dynamic library, compiled from the sources in the same folder, encloses the implementation of the library and wrapper functions.
The index factory is available via the `faiss_index_factory` function in `AutoTune_c.h`:
```c
FaissIndex* index = NULL;
int c = faiss_index_factory(&index, 64, "Flat", METRIC_L2);
if (c) {
// operation failed
}
```
Most operations that you would find as member functions are available with the format `faiss_«classname»_«member»`.
```c
idx_t ntotal = faiss_Index_ntotal(index);
```
Since this is C, the index needs to be freed manually in the end:
```c
faiss_Index_free(index);
```
Error handling is done by examining the error code returned by operations with recoverable errors.
The code identifies the type of exception that was raised by the implementation. Fetching the
corresponding error message can be done by calling the function `faiss_get_last_error()` from
`error_c.h`. Getter functions and `free` functions do not return an error code.
```c
int c = faiss_Index_add(index, nb, xb);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
An example is included, which is built automatically for the target `all`. It can also be built separately:
`make bin/example_c`
Building with GPU support
-------------------------
For GPU support, a separate dynamic library in the "c_api/gpu" directory needs to be built.
`make`
The "gpufaiss_c" dynamic library contains the GPU and CPU implementations of Faiss, which means that
it can be used in place of "faiss_c". The same library will dynamically link with the CUDA runtime
and cuBLAS.
Using the GPU with the C API
----------------------------
A standard GPU resources object can be obtained by the name `FaissStandardGpuResources`:
```c
FaissStandardGpuResources* gpu_res = NULL;
int c = faiss_StandardGpuResources_new(&gpu_res);
if (c) {
printf("%s", faiss_get_last_error());
exit(-1);
}
```
Similarly to the C++ API, a CPU index can be converted to a GPU index:
```c
FaissIndex* cpu_index = NULL;
int c = faiss_index_factory(&cpu_index, d, "Flat", METRIC_L2);
if (c) { /* ... */ }
FaissGpuIndex* gpu_index = NULL;
c = faiss_index_cpu_to_gpu(gpu_res, 0, cpu_index, &gpu_index);
if (c) { /* ... */ }
```
A more complete example is available by the name `bin/example_gpu_c`.
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
// -*- c++ -*-
#include "IndexFlat_c.h"
#include <faiss/IndexFlat.h>
#include <faiss/IndexRefine.h>
#include "macros_impl.h"
extern "C" {
using faiss::Index;
using faiss::IndexFlat;
using faiss::IndexFlat1D;
using faiss::IndexFlatIP;
using faiss::IndexFlatL2;
using faiss::IndexRefineFlat;
DEFINE_DESTRUCTOR(IndexFlat)
DEFINE_INDEX_DOWNCAST(IndexFlat)
/// Allocate a default-constructed IndexFlat and return it via *p_index.
int faiss_IndexFlat_new(FaissIndexFlat** p_index) {
    try {
        IndexFlat* impl = new IndexFlat();
        *p_index = reinterpret_cast<FaissIndexFlat*>(impl);
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an IndexFlat of dimension `d` with the given metric.
int faiss_IndexFlat_new_with(
        FaissIndexFlat** p_index,
        idx_t d,
        FaissMetricType metric) {
    try {
        auto m = static_cast<faiss::MetricType>(metric);
        *p_index = reinterpret_cast<FaissIndexFlat*>(new IndexFlat(d, m));
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Expose the index's internal vector storage. `p_size` is optional and,
/// when given, receives the storage size in number of float values.
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size) {
    auto impl = reinterpret_cast<IndexFlat*>(index);
    *p_xb = impl->get_xb();
    if (p_size != nullptr) {
        // codes is a byte buffer; convert its length to a float count
        *p_size = impl->codes.size() / sizeof(float);
    }
}
/// Compute distances between the `n` queries in `x` and the stored vectors
/// selected by `labels` (n * k entries); results go to `distances`.
int faiss_IndexFlat_compute_distance_subset(
        FaissIndex* index,
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        const idx_t* labels) {
    try {
        auto impl = reinterpret_cast<IndexFlat*>(index);
        impl->compute_distance_subset(n, x, k, distances, labels);
        return 0;
    }
    CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatIP)
DEFINE_INDEX_DOWNCAST(IndexFlatIP)
/// Allocate a default-constructed inner-product flat index.
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index) {
    try {
        *p_index = reinterpret_cast<FaissIndexFlatIP*>(new IndexFlatIP());
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an inner-product flat index of dimension `d`.
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d) {
    try {
        *p_index = reinterpret_cast<FaissIndexFlatIP*>(new IndexFlatIP(d));
        return 0;
    }
    CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexFlatL2)
DEFINE_INDEX_DOWNCAST(IndexFlatL2)
/// Allocate a default-constructed L2 flat index.
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index) {
    try {
        *p_index = reinterpret_cast<FaissIndexFlatL2*>(new IndexFlatL2());
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an L2 flat index of dimension `d`.
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d) {
    try {
        *p_index = reinterpret_cast<FaissIndexFlatL2*>(new IndexFlatL2(d));
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Wrap `base_index` in an IndexRefineFlat that re-ranks its results
/// with exact distances.
int faiss_IndexRefineFlat_new(
        FaissIndexRefineFlat** p_index,
        FaissIndex* base_index) {
    try {
        auto base = reinterpret_cast<faiss::Index*>(base_index);
        *p_index = reinterpret_cast<FaissIndexRefineFlat*>(
                new IndexRefineFlat(base));
        return 0;
    }
    CATCH_AND_HANDLE
}
DEFINE_DESTRUCTOR(IndexRefineFlat)
DEFINE_INDEX_DOWNCAST(IndexRefineFlat)
DEFINE_GETTER(IndexRefineFlat, int, own_fields)
DEFINE_SETTER(IndexRefineFlat, int, own_fields)
DEFINE_GETTER(IndexRefineFlat, float, k_factor)
DEFINE_SETTER(IndexRefineFlat, float, k_factor)
DEFINE_DESTRUCTOR(IndexFlat1D)
DEFINE_INDEX_DOWNCAST(IndexFlat1D)
/// Allocate a default-constructed IndexFlat1D.
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index) {
    try {
        *p_index = reinterpret_cast<FaissIndexFlat1D*>(new IndexFlat1D());
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an IndexFlat1D; `continuous_update` is interpreted as a bool.
int faiss_IndexFlat1D_new_with(
        FaissIndexFlat1D** p_index,
        int continuous_update) {
    try {
        const bool cu = static_cast<bool>(continuous_update);
        *p_index = reinterpret_cast<FaissIndexFlat1D*>(new IndexFlat1D(cu));
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Rebuild the sorted permutation of the stored 1D values.
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index) {
    try {
        auto impl = reinterpret_cast<IndexFlat1D*>(index);
        impl->update_permutation();
        return 0;
    }
    CATCH_AND_HANDLE
}
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved
// -*- c -*-
#ifndef FAISS_INDEX_FLAT_C_H
#define FAISS_INDEX_FLAT_C_H
#include "Index_c.h"
#include "faiss_c.h"
#ifdef __cplusplus
extern "C" {
#endif
// forward declaration
typedef enum FaissMetricType FaissMetricType;
/** Opaque type for IndexFlat */
FAISS_DECLARE_CLASS_INHERITED(IndexFlat, Index)
int faiss_IndexFlat_new(FaissIndexFlat** p_index);
int faiss_IndexFlat_new_with(
FaissIndexFlat** p_index,
idx_t d,
FaissMetricType metric);
/** get a pointer to the index's internal data (the `xb` field). The outputs
* become invalid after any data addition or removal operation.
*
* @param index opaque pointer to index object
* @param p_xb output, the pointer to the beginning of `xb`.
 * @param p_size output, the current size of `xb` in number of float values.
*/
void faiss_IndexFlat_xb(FaissIndexFlat* index, float** p_xb, size_t* p_size);
/** attempt a dynamic cast to a flat index, thus checking
 * whether the underlying index type is `IndexFlat`.
*
* @param index opaque pointer to index object
* @return the same pointer if the index is a flat index, NULL otherwise
*/
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat)
FAISS_DECLARE_DESTRUCTOR(IndexFlat)
/** compute distance with a subset of vectors
*
* @param index opaque pointer to index object
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
int faiss_IndexFlat_compute_distance_subset(
FaissIndex* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels);
/** Opaque type for IndexFlatIP */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatIP, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatIP)
FAISS_DECLARE_DESTRUCTOR(IndexFlatIP)
int faiss_IndexFlatIP_new(FaissIndexFlatIP** p_index);
int faiss_IndexFlatIP_new_with(FaissIndexFlatIP** p_index, idx_t d);
/** Opaque type for IndexFlatL2 */
FAISS_DECLARE_CLASS_INHERITED(IndexFlatL2, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlatL2)
FAISS_DECLARE_DESTRUCTOR(IndexFlatL2)
int faiss_IndexFlatL2_new(FaissIndexFlatL2** p_index);
int faiss_IndexFlatL2_new_with(FaissIndexFlatL2** p_index, idx_t d);
/** Opaque type for IndexRefineFlat
*
* Index that queries in a base_index (a fast one) and refines the
* results with an exact search, hopefully improving the results.
*/
FAISS_DECLARE_CLASS_INHERITED(IndexRefineFlat, Index)
int faiss_IndexRefineFlat_new(
FaissIndexRefineFlat** p_index,
FaissIndex* base_index);
FAISS_DECLARE_DESTRUCTOR(IndexRefineFlat)
FAISS_DECLARE_INDEX_DOWNCAST(IndexRefineFlat)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, int, own_fields)
/// factor between k requested in search and the k requested from
/// the base_index (should be >= 1)
FAISS_DECLARE_GETTER_SETTER(IndexRefineFlat, float, k_factor)
/** Opaque type for IndexFlat1D
*
* optimized version for 1D "vectors"
*/
FAISS_DECLARE_CLASS_INHERITED(IndexFlat1D, Index)
FAISS_DECLARE_INDEX_DOWNCAST(IndexFlat1D)
FAISS_DECLARE_DESTRUCTOR(IndexFlat1D)
int faiss_IndexFlat1D_new(FaissIndexFlat1D** p_index);
int faiss_IndexFlat1D_new_with(
FaissIndexFlat1D** p_index,
int continuous_update);
int faiss_IndexFlat1D_update_permutation(FaissIndexFlat1D* index);
#ifdef __cplusplus
}
#endif
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
// -*- c++ -*-
#include "IndexIVFFlat_c.h"
#include <faiss/IndexIVFFlat.h>
#include "Clustering_c.h"
#include "Index_c.h"
#include "macros_impl.h"
using faiss::Index;
using faiss::IndexIVFFlat;
using faiss::MetricType;
DEFINE_DESTRUCTOR(IndexIVFFlat)
DEFINE_INDEX_DOWNCAST(IndexIVFFlat)
/// number of possible key values
DEFINE_GETTER(IndexIVFFlat, size_t, nlist)
/// number of probes at query time
DEFINE_GETTER(IndexIVFFlat, size_t, nprobe)
DEFINE_SETTER(IndexIVFFlat, size_t, nprobe)
/// quantizer that maps vectors to inverted lists
DEFINE_GETTER_PERMISSIVE(IndexIVFFlat, FaissIndex*, quantizer)
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
DEFINE_GETTER(IndexIVFFlat, char, quantizer_trains_alone)
/// whether object owns the quantizer
DEFINE_GETTER(IndexIVFFlat, int, own_fields)
DEFINE_SETTER(IndexIVFFlat, int, own_fields)
/// Allocate a default-constructed IndexIVFFlat and return it via *p_index.
/// Returns 0 on success; errors are converted by CATCH_AND_HANDLE.
int faiss_IndexIVFFlat_new(FaissIndexIVFFlat** p_index) {
    try {
        *p_index = reinterpret_cast<FaissIndexIVFFlat*>(new IndexIVFFlat());
        // fix: the success path must return a value; flowing off the end of
        // a non-void function is undefined behavior
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an IndexIVFFlat over `quantizer` with dimension `d` and
/// `nlist` inverted lists. Returns 0 on success.
int faiss_IndexIVFFlat_new_with(
        FaissIndexIVFFlat** p_index,
        FaissIndex* quantizer,
        size_t d,
        size_t nlist) {
    try {
        auto q = reinterpret_cast<Index*>(quantizer);
        *p_index = reinterpret_cast<FaissIndexIVFFlat*>(
                new IndexIVFFlat(q, d, nlist));
        // fix: was missing, non-void function must not fall off the end
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Allocate an IndexIVFFlat over `quantizer` with dimension `d`,
/// `nlist` inverted lists and an explicit metric. Returns 0 on success.
int faiss_IndexIVFFlat_new_with_metric(
        FaissIndexIVFFlat** p_index,
        FaissIndex* quantizer,
        size_t d,
        size_t nlist,
        FaissMetricType metric) {
    try {
        auto q = reinterpret_cast<Index*>(quantizer);
        auto m = static_cast<MetricType>(metric);
        *p_index = reinterpret_cast<FaissIndexIVFFlat*>(
                new IndexIVFFlat(q, d, nlist, m));
        // fix: was missing, non-void function must not fall off the end
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Add `n` vectors with optional ids (`xids`) and optional precomputed
/// inverted-list assignments (`precomputed_idx`). Returns 0 on success.
int faiss_IndexIVFFlat_add_core(
        FaissIndexIVFFlat* index,
        idx_t n,
        const float* x,
        const idx_t* xids,
        const int64_t* precomputed_idx) {
    try {
        reinterpret_cast<IndexIVFFlat*>(index)->add_core(
                n, x, xids, precomputed_idx);
        // fix: was missing, non-void function must not fall off the end
        return 0;
    }
    CATCH_AND_HANDLE
}
/// Overwrite the `nv` stored vectors identified by `idx` with the data
/// in `v`. Returns 0 on success.
int faiss_IndexIVFFlat_update_vectors(
        FaissIndexIVFFlat* index,
        int nv,
        idx_t* idx,
        const float* v) {
    try {
        reinterpret_cast<IndexIVFFlat*>(index)->update_vectors(nv, idx, v);
        // fix: was missing, non-void function must not fall off the end
        return 0;
    }
    CATCH_AND_HANDLE
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment