Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IVFlib.h>
#include <memory>
#include <faiss/IndexPreTransform.h>
#include <faiss/MetaIndexes.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/utils.h>
namespace faiss {
namespace ivflib {
void check_compatible_for_merge(const Index* index0, const Index* index1) {
const faiss::IndexPreTransform* pt0 =
dynamic_cast<const faiss::IndexPreTransform*>(index0);
if (pt0) {
const faiss::IndexPreTransform* pt1 =
dynamic_cast<const faiss::IndexPreTransform*>(index1);
FAISS_THROW_IF_NOT_MSG(pt1, "both indexes should be pretransforms");
FAISS_THROW_IF_NOT(pt0->chain.size() == pt1->chain.size());
for (size_t i = 0; i < pt0->chain.size(); i++) {
FAISS_THROW_IF_NOT(typeid(*pt0->chain[i]) == typeid(*pt1->chain[i]));
}
index0 = pt0->index;
index1 = pt1->index;
}
FAISS_THROW_IF_NOT(typeid(*index0) == typeid(*index1));
FAISS_THROW_IF_NOT(
index0->d == index1->d &&
index0->metric_type == index1->metric_type);
const faiss::IndexIVF* ivf0 = dynamic_cast<const faiss::IndexIVF*>(index0);
if (ivf0) {
const faiss::IndexIVF* ivf1 =
dynamic_cast<const faiss::IndexIVF*>(index1);
FAISS_THROW_IF_NOT(ivf1);
ivf0->check_compatible_for_merge(*ivf1);
}
// TODO: check as thoroughly for other index types
}
const IndexIVF* try_extract_index_ivf(const Index* index) {
if (auto* pt = dynamic_cast<const IndexPreTransform*>(index)) {
index = pt->index;
}
if (auto* idmap = dynamic_cast<const IndexIDMap*>(index)) {
index = idmap->index;
}
if (auto* idmap = dynamic_cast<const IndexIDMap2*>(index)) {
index = idmap->index;
}
auto* ivf = dynamic_cast<const IndexIVF*>(index);
return ivf;
}
IndexIVF* try_extract_index_ivf(Index* index) {
return const_cast<IndexIVF*>(try_extract_index_ivf((const Index*)(index)));
}
const IndexIVF* extract_index_ivf(const Index* index) {
const IndexIVF* ivf = try_extract_index_ivf(index);
FAISS_THROW_IF_NOT(ivf);
return ivf;
}
IndexIVF* extract_index_ivf(Index* index) {
return const_cast<IndexIVF*>(extract_index_ivf((const Index*)(index)));
}
void merge_into(faiss::Index* index0, faiss::Index* index1, bool shift_ids) {
check_compatible_for_merge(index0, index1);
IndexIVF* ivf0 = extract_index_ivf(index0);
IndexIVF* ivf1 = extract_index_ivf(index1);
ivf0->merge_from(*ivf1, shift_ids ? ivf0->ntotal : 0);
// useful for IndexPreTransform
index0->ntotal = ivf0->ntotal;
index1->ntotal = ivf1->ntotal;
}
void search_centroid(
faiss::Index* index,
const float* x,
int n,
idx_t* centroid_ids) {
std::unique_ptr<float[]> del;
if (auto index_pre = dynamic_cast<faiss::IndexPreTransform*>(index)) {
x = index_pre->apply_chain(n, x);
del.reset((float*)x);
index = index_pre->index;
}
faiss::IndexIVF* index_ivf = dynamic_cast<faiss::IndexIVF*>(index);
assert(index_ivf);
index_ivf->quantizer->assign(n, x, centroid_ids);
}
void search_and_return_centroids(
faiss::Index* index,
size_t n,
const float* xin,
long k,
float* distances,
idx_t* labels,
idx_t* query_centroid_ids,
idx_t* result_centroid_ids) {
const float* x = xin;
std::unique_ptr<float[]> del;
if (auto index_pre = dynamic_cast<faiss::IndexPreTransform*>(index)) {
x = index_pre->apply_chain(n, x);
del.reset((float*)x);
index = index_pre->index;
}
faiss::IndexIVF* index_ivf = dynamic_cast<faiss::IndexIVF*>(index);
assert(index_ivf);
size_t nprobe = index_ivf->nprobe;
std::vector<idx_t> cent_nos(n * nprobe);
std::vector<float> cent_dis(n * nprobe);
index_ivf->quantizer->search(
n, x, nprobe, cent_dis.data(), cent_nos.data());
if (query_centroid_ids) {
for (size_t i = 0; i < n; i++)
query_centroid_ids[i] = cent_nos[i * nprobe];
}
index_ivf->search_preassigned(
n, x, k, cent_nos.data(), cent_dis.data(), distances, labels, true);
for (size_t i = 0; i < n * k; i++) {
idx_t label = labels[i];
if (label < 0) {
if (result_centroid_ids)
result_centroid_ids[i] = -1;
} else {
long list_no = lo_listno(label);
long list_index = lo_offset(label);
if (result_centroid_ids)
result_centroid_ids[i] = list_no;
labels[i] = index_ivf->invlists->get_single_id(list_no, list_index);
}
}
}
SlidingIndexWindow::SlidingIndexWindow(Index* index) : index(index) {
n_slice = 0;
IndexIVF* index_ivf = const_cast<IndexIVF*>(extract_index_ivf(index));
ils = dynamic_cast<ArrayInvertedLists*>(index_ivf->invlists);
FAISS_THROW_IF_NOT_MSG(
ils, "only supports indexes with ArrayInvertedLists");
nlist = ils->nlist;
sizes.resize(nlist);
}
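// shift the `remove` oldest elements out of dst, keep its tail, and
// append src at the end; dst ends up with size dst.size() - remove + src.size()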
template <class T>
static void shift_and_add(
std::vector<T>& dst,
size_t remove,
const std::vector<T>& src) {
if (remove > 0)
memmove(dst.data(),
dst.data() + remove,
(dst.size() - remove) * sizeof(T));
size_t insert_point = dst.size() - remove;
dst.resize(insert_point + src.size());
memcpy(dst.data() + insert_point, src.data(), src.size() * sizeof(T));
}
template <class T>
static void remove_from_begin(std::vector<T>& v, size_t remove) {
if (remove > 0)
v.erase(v.begin(), v.begin() + remove);
}
void SlidingIndexWindow::step(const Index* sub_index, bool remove_oldest) {
FAISS_THROW_IF_NOT_MSG(
!remove_oldest || n_slice > 0,
"cannot remove slice: there is none");
const ArrayInvertedLists* ils2 = nullptr;
if (sub_index) {
check_compatible_for_merge(index, sub_index);
ils2 = dynamic_cast<const ArrayInvertedLists*>(
extract_index_ivf(sub_index)->invlists);
FAISS_THROW_IF_NOT_MSG(ils2, "supports only ArrayInvertedLists");
}
IndexIVF* index_ivf = extract_index_ivf(index);
if (remove_oldest && ils2) {
for (int i = 0; i < nlist; i++) {
std::vector<size_t>& sizesi = sizes[i];
size_t amount_to_remove = sizesi[0];
index_ivf->ntotal += ils2->ids[i].size() - amount_to_remove;
shift_and_add(ils->ids[i], amount_to_remove, ils2->ids[i]);
shift_and_add(
ils->codes[i],
amount_to_remove * ils->code_size,
ils2->codes[i]);
for (int j = 0; j + 1 < n_slice; j++) {
sizesi[j] = sizesi[j + 1] - amount_to_remove;
}
sizesi[n_slice - 1] = ils->ids[i].size();
}
} else if (ils2) {
for (int i = 0; i < nlist; i++) {
index_ivf->ntotal += ils2->ids[i].size();
shift_and_add(ils->ids[i], 0, ils2->ids[i]);
shift_and_add(ils->codes[i], 0, ils2->codes[i]);
sizes[i].push_back(ils->ids[i].size());
}
n_slice++;
} else if (remove_oldest) {
for (int i = 0; i < nlist; i++) {
size_t amount_to_remove = sizes[i][0];
index_ivf->ntotal -= amount_to_remove;
remove_from_begin(ils->ids[i], amount_to_remove);
remove_from_begin(ils->codes[i], amount_to_remove * ils->code_size);
for (int j = 0; j + 1 < n_slice; j++) {
sizes[i][j] = sizes[i][j + 1] - amount_to_remove;
}
sizes[i].pop_back();
}
n_slice--;
} else {
FAISS_THROW_MSG("nothing to do???");
}
index->ntotal = index_ivf->ntotal;
}
// Get a subset of inverted lists [i0, i1). Works on IndexIVFs and
// IndexIVFs embedded in an IndexPreTransform
ArrayInvertedLists* get_invlist_range(const Index* index, long i0, long i1) {
const IndexIVF* ivf = extract_index_ivf(index);
FAISS_THROW_IF_NOT(0 <= i0 && i0 <= i1 && i1 <= ivf->nlist);
const InvertedLists* src = ivf->invlists;
ArrayInvertedLists* il = new ArrayInvertedLists(i1 - i0, src->code_size);
for (long i = i0; i < i1; i++) {
il->add_entries(
i - i0,
src->list_size(i),
InvertedLists::ScopedIds(src, i).get(),
InvertedLists::ScopedCodes(src, i).get());
}
return il;
}
void set_invlist_range(
Index* index,
long i0,
long i1,
ArrayInvertedLists* src) {
IndexIVF* ivf = extract_index_ivf(index);
FAISS_THROW_IF_NOT(0 <= i0 && i0 <= i1 && i1 <= ivf->nlist);
ArrayInvertedLists* dst = dynamic_cast<ArrayInvertedLists*>(ivf->invlists);
FAISS_THROW_IF_NOT_MSG(dst, "only ArrayInvertedLists supported");
FAISS_THROW_IF_NOT(
src->nlist == i1 - i0 && dst->code_size == src->code_size);
size_t ntotal = index->ntotal;
for (long i = i0; i < i1; i++) {
ntotal -= dst->list_size(i);
ntotal += src->list_size(i - i0);
std::swap(src->codes[i - i0], dst->codes[i]);
std::swap(src->ids[i - i0], dst->ids[i]);
}
ivf->ntotal = index->ntotal = ntotal;
}
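// count the inverted-list entries that a scan of the assignment table Iq
// (n_list_scan entries) will visit; unassigned (-1) entries are skipped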
static size_t count_ndis(
const IndexIVF* index_ivf,
size_t n_list_scan,
const idx_t* Iq) {
size_t nb_dis = 0;
const InvertedLists* il = index_ivf->invlists;
for (idx_t i = 0; i < n_list_scan; i++) {
if (Iq[i] >= 0) {
nb_dis += il->list_size(Iq[i]);
}
}
return nb_dis;
}
void search_with_parameters(
const Index* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
const IVFSearchParameters* params,
size_t* nb_dis_ptr,
double* ms_per_stage) {
FAISS_THROW_IF_NOT(params);
const float* prev_x = x;
ScopeDeleter<float> del;
double t0 = getmillisecs();
if (auto ip = dynamic_cast<const IndexPreTransform*>(index)) {
x = ip->apply_chain(n, x);
if (x != prev_x) {
del.set(x);
}
index = ip->index;
}
double t1 = getmillisecs();
std::vector<idx_t> Iq(params->nprobe * n);
std::vector<float> Dq(params->nprobe * n);
const IndexIVF* index_ivf = dynamic_cast<const IndexIVF*>(index);
FAISS_THROW_IF_NOT(index_ivf);
index_ivf->quantizer->search(n, x, params->nprobe, Dq.data(), Iq.data());
if (nb_dis_ptr) {
*nb_dis_ptr = count_ndis(index_ivf, n * params->nprobe, Iq.data());
}
double t2 = getmillisecs();
index_ivf->search_preassigned(
n, x, k, Iq.data(), Dq.data(), distances, labels, false, params);
double t3 = getmillisecs();
if (ms_per_stage) {
ms_per_stage[0] = t1 - t0;
ms_per_stage[1] = t2 - t1;
ms_per_stage[2] = t3 - t2;
}
}
void range_search_with_parameters(
const Index* index,
idx_t n,
const float* x,
float radius,
RangeSearchResult* result,
const IVFSearchParameters* params,
size_t* nb_dis_ptr,
double* ms_per_stage) {
FAISS_THROW_IF_NOT(params);
const float* prev_x = x;
ScopeDeleter<float> del;
double t0 = getmillisecs();
if (auto ip = dynamic_cast<const IndexPreTransform*>(index)) {
x = ip->apply_chain(n, x);
if (x != prev_x) {
del.set(x);
}
index = ip->index;
}
double t1 = getmillisecs();
std::vector<idx_t> Iq(params->nprobe * n);
std::vector<float> Dq(params->nprobe * n);
const IndexIVF* index_ivf = dynamic_cast<const IndexIVF*>(index);
FAISS_THROW_IF_NOT(index_ivf);
index_ivf->quantizer->search(n, x, params->nprobe, Dq.data(), Iq.data());
if (nb_dis_ptr) {
*nb_dis_ptr = count_ndis(index_ivf, n * params->nprobe, Iq.data());
}
double t2 = getmillisecs();
index_ivf->range_search_preassigned(
n, x, radius, Iq.data(), Dq.data(), result, false, params);
double t3 = getmillisecs();
if (ms_per_stage) {
ms_per_stage[0] = t1 - t0;
ms_per_stage[1] = t2 - t1;
ms_per_stage[2] = t3 - t2;
}
}
} // namespace ivflib
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_IVFLIB_H
#define FAISS_IVFLIB_H
/** Since IVF (inverted file) indexes are so useful for
* large-scale use cases, we group a few functions related to them in
* this small library. Most functions work both on IndexIVFs and
* IndexIVFs embedded within an IndexPreTransform.
*/
#include <faiss/IndexIVF.h>
#include <vector>
namespace faiss {
namespace ivflib {
/** check if two indexes have the same parameters and are trained in
* the same way, otherwise throw. */
void check_compatible_for_merge(const Index* index1, const Index* index2);
/** get an IndexIVF from an index. The index may be an IndexIVF or
* some wrapper class that encloses an IndexIVF
*
* throws an exception if this is not the case.
*/
const IndexIVF* extract_index_ivf(const Index* index);
IndexIVF* extract_index_ivf(Index* index);
/// same as above but returns nullptr instead of throwing on failure
const IndexIVF* try_extract_index_ivf(const Index* index);
IndexIVF* try_extract_index_ivf(Index* index);
/** Merge index1 into index0. Works on IndexIVFs and IndexIVFs
* embedded in an IndexPreTransform. On output, index1 is empty.
*
* @param shift_ids: translate the ids from index1 to index0->prev_ntotal
*/
void merge_into(Index* index0, Index* index1, bool shift_ids);
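/* Example (illustrative sketch, not part of the library): merging two
* compatible, trained IndexIVFFlat instances; variable names are
* hypothetical.
*
*     faiss::IndexFlatL2 coarse(d);
*     faiss::IndexIVFFlat index0(&coarse, d, nlist);
*     faiss::IndexIVFFlat index1(&coarse, d, nlist);
*     // ... train both the same way, then add vectors to each ...
*     faiss::ivflib::merge_into(&index0, &index1, true); // shift ids
*     // index1 is now empty; its vectors keep ids shifted by the
*     // pre-merge index0->ntotal
*/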
typedef Index::idx_t idx_t;
/* Returns the cluster the embeddings belong to.
 *
 * @param index Index, which should be an IVF index
 * (otherwise there are no clusters)
 * @param x embeddings for which the centroid ids should be found,
 * size n * d
 * @param centroid_ids
 * cluster id each embedding belongs to, size n
 */
void search_centroid(Index* index, const float* x, int n, idx_t* centroid_ids);
/* Same as a standard IVF search, but also returns the centroids of the
 * queries and of the results.
*
* @param index Index, which should be an IVF index
* (otherwise there are no clusters)
* @param query_centroid_ids
* centroid ids corresponding to the query vectors (size n)
* @param result_centroid_ids
* centroid ids corresponding to the results (size n * k)
* other arguments are the same as the standard search function
*/
void search_and_return_centroids(
Index* index,
size_t n,
const float* xin,
long k,
float* distances,
idx_t* labels,
idx_t* query_centroid_ids,
idx_t* result_centroid_ids);
/** A set of IndexIVFs concatenated together in a FIFO fashion.
* At each "step", the oldest index slice is removed and a new index is added.
*/
struct SlidingIndexWindow {
/// common index that contains the sliding window
Index* index;
/// InvertedLists of index
ArrayInvertedLists* ils;
/// number of slices currently in index
int n_slice;
/// same as index->nlist
size_t nlist;
/// cumulative list sizes at each slice
std::vector<std::vector<size_t>> sizes;
/// index should be initially empty and trained
SlidingIndexWindow(Index* index);
/** Add one index to the current index and remove the oldest one.
*
* @param sub_index slice to swap in (can be NULL)
* @param remove_oldest if true, remove the oldest slice */
void step(const Index* sub_index, bool remove_oldest);
};
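/* Example (sketch): maintaining a window over the W most recent slices.
* make_slice is a hypothetical helper that builds a trained IndexIVF
* slice compatible with the window's index.
*
*     faiss::ivflib::SlidingIndexWindow window(index); // empty, trained
*     for (int t = 0; t < nstep; t++) {
*         faiss::Index* slice = make_slice(t);
*         // once the window is full, also drop the oldest slice
*         window.step(slice, window.n_slice >= W);
*         delete slice; // step() copied the data
*     }
*/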
/// Get a subset of inverted lists [i0, i1)
ArrayInvertedLists* get_invlist_range(const Index* index, long i0, long i1);
/// Set a subset of inverted lists
void set_invlist_range(Index* index, long i0, long i1, ArrayInvertedLists* src);
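/* Example (sketch): moving a range of inverted lists between two
* compatible IVF indexes, e.g. to shard an index. Note that
* set_invlist_range swaps list contents, so after the call the
* ArrayInvertedLists holds what was previously in index_dst.
*
*     faiss::ArrayInvertedLists* il =
*             faiss::ivflib::get_invlist_range(index_src, i0, i1);
*     faiss::ivflib::set_invlist_range(index_dst, i0, i1, il);
*     delete il;
*/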
/** search an IndexIVF, possibly embedded in an IndexPreTransform with
* given parameters. This is a way to set the nprobe and get
* statistics in a thread-safe way.
*
* Optionally returns (if non-nullptr):
* - nb_dis: number of distances computed
* - ms_per_stage: [0]: preprocessing time
* [1]: coarse quantization,
* [2]: list scanning
*/
void search_with_parameters(
const Index* index,
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
const IVFSearchParameters* params,
size_t* nb_dis = nullptr,
double* ms_per_stage = nullptr);
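/* Example (sketch): setting nprobe per query batch without mutating the
* index, which is safe when several threads search concurrently.
*
*     faiss::IVFSearchParameters params;
*     params.nprobe = 16;
*     params.max_codes = 0;
*     size_t ndis = 0;
*     double ms[3];
*     faiss::ivflib::search_with_parameters(
*             index, n, queries, k, distances, labels, &params, &ndis, ms);
*     // ms[0]: preprocessing, ms[1]: coarse quantization, ms[2]: scanning
*/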
/** same as search_with_parameters but for range search */
void range_search_with_parameters(
const Index* index,
idx_t n,
const float* x,
float radius,
RangeSearchResult* result,
const IVFSearchParameters* params,
size_t* nb_dis = nullptr,
double* ms_per_stage = nullptr);
} // namespace ivflib
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Index.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <cstring>
namespace faiss {
Index::~Index() {}
void Index::train(idx_t /*n*/, const float* /*x*/) {
// does nothing by default
}
void Index::range_search(idx_t, const float*, float, RangeSearchResult*) const {
FAISS_THROW_MSG("range search not implemented");
}
void Index::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const {
std::vector<float> distances(n * k);
search(n, x, k, distances.data(), labels);
}
void Index::add_with_ids(
idx_t /*n*/,
const float* /*x*/,
const idx_t* /*xids*/) {
FAISS_THROW_MSG("add_with_ids not implemented for this type of index");
}
size_t Index::remove_ids(const IDSelector& /*sel*/) {
FAISS_THROW_MSG("remove_ids not implemented for this type of index");
return -1;
}
void Index::reconstruct(idx_t, float*) const {
FAISS_THROW_MSG("reconstruct not implemented for this type of index");
}
void Index::reconstruct_n(idx_t i0, idx_t ni, float* recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct(i0 + i, recons + i * d);
}
}
void Index::search_and_reconstruct(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
float* recons) const {
FAISS_THROW_IF_NOT(k > 0);
search(n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct(key, reconstructed);
}
}
}
}
void Index::compute_residual(const float* x, float* residual, idx_t key) const {
reconstruct(key, residual);
for (size_t i = 0; i < d; i++) {
residual[i] = x[i] - residual[i];
}
}
void Index::compute_residual_n(
idx_t n,
const float* xs,
float* residuals,
const idx_t* keys) const {
#pragma omp parallel for
for (idx_t i = 0; i < n; ++i) {
compute_residual(&xs[i * d], &residuals[i * d], keys[i]);
}
}
size_t Index::sa_code_size() const {
FAISS_THROW_MSG("standalone codec not implemented for this type of index");
}
void Index::sa_encode(idx_t, const float*, uint8_t*) const {
FAISS_THROW_MSG("standalone codec not implemented for this type of index");
}
void Index::sa_decode(idx_t, const uint8_t*, float*) const {
FAISS_THROW_MSG("standalone codec not implemented for this type of index");
}
namespace {
// storage that explicitly reconstructs vectors before computing distances
struct GenericDistanceComputer : DistanceComputer {
size_t d;
const Index& storage;
std::vector<float> buf;
const float* q;
explicit GenericDistanceComputer(const Index& storage) : storage(storage) {
d = storage.d;
buf.resize(d * 2);
}
float operator()(idx_t i) override {
storage.reconstruct(i, buf.data());
return fvec_L2sqr(q, buf.data(), d);
}
float symmetric_dis(idx_t i, idx_t j) override {
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float* x) override {
q = x;
}
};
} // namespace
DistanceComputer* Index::get_distance_computer() const {
if (metric_type == METRIC_L2) {
return new GenericDistanceComputer(*this);
} else {
FAISS_THROW_MSG("get_distance_computer() not implemented");
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_H
#define FAISS_INDEX_H
#include <faiss/MetricType.h>
#include <cstdio>
#include <sstream>
#include <string>
#include <typeinfo>
#define FAISS_VERSION_MAJOR 1
#define FAISS_VERSION_MINOR 7
#define FAISS_VERSION_PATCH 2
/**
* @namespace faiss
*
* Throughout the library, vectors are provided as float * pointers.
* Most algorithms can be optimized when several vectors are processed
* (added/searched) together in a batch. In this case, they are passed
* in as a matrix. When n vectors of size d are provided as float * x,
* component j of vector i is
*
* x[ i * d + j ]
*
* where 0 <= i < n and 0 <= j < d. In other words, matrices are
* always compact. When specifying the size of the matrix, we call it
* an n*d matrix, which implies a row-major storage.
*/
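/* For example, with n = 2 vectors of dimension d = 3, the buffer holds
* {x(0,0), x(0,1), x(0,2), x(1,0), x(1,1), x(1,2)}: component j of
* vector i is x[i * 3 + j], with no padding between rows. */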
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
/** Abstract structure for an index, supports adding vectors and searching them.
*
* All vectors provided at add or search time are 32-bit float arrays,
* although the internal representation may vary.
*/
struct Index {
using idx_t = int64_t; ///< all indices are this type
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is
/// done already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
float metric_arg; ///< argument of the metric type
explicit Index(idx_t d = 0, MetricType metric = METRIC_L2)
: d(d),
ntotal(0),
verbose(false),
is_trained(true),
metric_type(metric),
metric_arg(0) {}
virtual ~Index();
/** Perform training on a representative set of vectors
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* This function slices the input vectors in chunks smaller than
* blocksize_add and calls add_core.
* @param x input matrix, size n * d
*/
virtual void add(idx_t n, const float* x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids(idx_t n, const float* x, const idx_t* xids);
/** query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const = 0;
/** query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many
* indexes do not implement the range_search (only the k-NN search
* is mandatory).
*
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const;
/** return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns labels of neighbors.
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
virtual void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1)
const;
/// removes all elements from the database.
virtual void reset() = 0;
/** removes IDs from the index. Not supported by all
* indexes. Returns the number of elements removed.
*/
virtual size_t remove_ids(const IDSelector& sel);
/** Reconstruct a stored vector (or an approximation if lossy coding)
*
* this function may not be defined for some indexes
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
virtual void reconstruct(idx_t key, float* recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1
*
* this function may not be defined for some indexes
* @param recons reconstructed vectors (size ni * d)
*/
virtual void reconstruct_n(idx_t i0, idx_t ni, float* recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting arrays
* are padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
float* recons) const;
/** Computes a residual vector after indexing encoding.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual(const float* x, float* residual, idx_t key)
const;
/** Computes a residual vector after indexing encoding (batch form).
* Equivalent to calling compute_residual for each vector.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multiple-stage indexing
* methods, like IndexIVF's methods.
*
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded index, as returned by search and assign
*/
virtual void compute_residual_n(
idx_t n,
const float* xs,
float* residuals,
const idx_t* keys) const;
/** Get a DistanceComputer (defined in AuxIndexStructures) object
* for this kind of index.
*
* DistanceComputer is implemented for indexes that support random
* access of their vectors.
*/
virtual DistanceComputer* get_distance_computer() const;
/* The standalone codec interface */
/** size of the produced codes in bytes */
virtual size_t sa_code_size() const;
/** encode a set of vectors
*
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode(idx_t n, const float* x, uint8_t* bytes) const;
/** encode a set of vectors
*
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode(idx_t n, const uint8_t* bytes, float* x) const;
};
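/* Example (sketch): round trip through the standalone codec interface,
* valid for any trained index that implements it.
*
*     std::vector<uint8_t> codes(n * index->sa_code_size());
*     index->sa_encode(n, x, codes.data());
*     std::vector<float> decoded(n * index->d);
*     index->sa_decode(n, codes.data(), decoded.data());
*     // decoded now approximates x (up to the codec's quantization error)
*/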
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/Index2Layer.h>
#include <faiss/impl/platform_macros.h>
#include <stdint.h>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#ifdef __SSE3__
#include <immintrin.h>
#endif
#include <algorithm>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/utils.h>
namespace faiss {
/*************************************
* Index2Layer implementation
*************************************/
Index2Layer::Index2Layer(
Index* quantizer,
size_t nlist,
int M,
int nbit,
MetricType metric)
: IndexFlatCodes(0, quantizer->d, metric),
q1(quantizer, nlist),
pq(quantizer->d, M, nbit) {
is_trained = false;
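// find the smallest number of bytes that can encode a list number,
// i.e. the first nbyte such that 256^nbyte >= nlist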
for (int nbyte = 0; nbyte < 7; nbyte++) {
if ((1L << (8 * nbyte)) >= nlist) {
code_size_1 = nbyte;
break;
}
}
code_size_2 = pq.code_size;
code_size = code_size_1 + code_size_2;
}
Index2Layer::Index2Layer() {
code_size = code_size_1 = code_size_2 = 0;
}
Index2Layer::~Index2Layer() {}
void Index2Layer::train(idx_t n, const float* x) {
if (verbose) {
printf("training level-1 quantizer %" PRId64 " vectors in %dD\n", n, d);
}
q1.train_q1(n, x, verbose, metric_type);
if (verbose) {
printf("computing residuals\n");
}
const float* x_in = x;
x = fvecs_maybe_subsample(
d,
(size_t*)&n,
pq.cp.max_points_per_centroid * pq.ksub,
x,
verbose,
pq.cp.seed);
ScopeDeleter<float> del_x(x_in == x ? nullptr : x);
std::vector<idx_t> assign(n); // assignment to coarse centroids
q1.quantizer->assign(n, x, assign.data());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual(
x + i * d, residuals.data() + i * d, assign[i]);
}
if (verbose)
printf("training %zdx%zd product quantizer on %" PRId64
" vectors in %dD\n",
pq.M,
pq.ksub,
n,
d);
pq.verbose = verbose;
pq.train(n, residuals.data());
is_trained = true;
}
void Index2Layer::search(
idx_t /*n*/,
const float* /*x*/,
idx_t /*k*/,
float* /*distances*/,
idx_t* /*labels*/) const {
FAISS_THROW_MSG("not implemented");
}
void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const {
FAISS_THROW_IF_NOT(other.nlist == q1.nlist);
FAISS_THROW_IF_NOT(other.code_size == code_size_2);
FAISS_THROW_IF_NOT(other.ntotal == 0);
const uint8_t* rp = codes.data();
for (idx_t i = 0; i < ntotal; i++) {
idx_t key = 0;
memcpy(&key, rp, code_size_1);
rp += code_size_1;
other.invlists->add_entry(key, i, rp);
rp += code_size_2;
}
other.ntotal = ntotal;
}
namespace {
struct Distance2Level : DistanceComputer {
size_t d;
const Index2Layer& storage;
std::vector<float> buf;
const float* q;
const float *pq_l1_tab, *pq_l2_tab;
explicit Distance2Level(const Index2Layer& storage) : storage(storage) {
d = storage.d;
FAISS_ASSERT(storage.pq.dsub == 4);
pq_l2_tab = storage.pq.centroids.data();
buf.resize(2 * d);
}
float symmetric_dis(idx_t i, idx_t j) override {
storage.reconstruct(i, buf.data());
storage.reconstruct(j, buf.data() + d);
return fvec_L2sqr(buf.data() + d, buf.data(), d);
}
void set_query(const float* x) override {
q = x;
}
};
// well optimized for xNN+PQNN
struct DistanceXPQ4 : Distance2Level {
int M, k;
explicit DistanceXPQ4(const Index2Layer& storage)
: Distance2Level(storage) {
const IndexFlat* quantizer =
dynamic_cast<IndexFlat*>(storage.q1.quantizer);
FAISS_ASSERT(quantizer);
M = storage.pq.M;
pq_l1_tab = quantizer->get_xb();
}
float operator()(idx_t i) override {
#ifdef __SSE3__
const uint8_t* code = storage.codes.data() + i * storage.code_size;
long key = 0;
memcpy(&key, code, storage.code_size_1);
code += storage.code_size_1;
// walking pointers
const float* qa = q;
const __m128* l1_t = (const __m128*)(pq_l1_tab + d * key);
const __m128* pq_l2_t = (const __m128*)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int m = 0; m < M; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = _mm_add_ps(l1_t[m], pq_l2_t[*code++]);
__m128 diff = _mm_sub_ps(qi, recons);
accu = _mm_add_ps(accu, _mm_mul_ps(diff, diff));
pq_l2_t += 256;
qa += 4;
}
accu = _mm_hadd_ps(accu, accu);
accu = _mm_hadd_ps(accu, accu);
return _mm_cvtss_f32(accu);
#else
FAISS_THROW_MSG("not implemented for non-x64 platforms");
#endif
}
};
// well optimized for 2xNN+PQNN
struct Distance2xXPQ4 : Distance2Level {
int M_2, mi_nbits;
explicit Distance2xXPQ4(const Index2Layer& storage)
: Distance2Level(storage) {
const MultiIndexQuantizer* mi =
dynamic_cast<MultiIndexQuantizer*>(storage.q1.quantizer);
FAISS_ASSERT(mi);
FAISS_ASSERT(storage.pq.M % 2 == 0);
M_2 = storage.pq.M / 2;
mi_nbits = mi->pq.nbits;
pq_l1_tab = mi->pq.centroids.data();
}
float operator()(idx_t i) override {
const uint8_t* code = storage.codes.data() + i * storage.code_size;
long key01 = 0;
memcpy(&key01, code, storage.code_size_1);
code += storage.code_size_1;
#ifdef __SSE3__
// walking pointers
const float* qa = q;
const __m128* pq_l1_t = (const __m128*)pq_l1_tab;
const __m128* pq_l2_t = (const __m128*)pq_l2_tab;
__m128 accu = _mm_setzero_ps();
for (int mi_m = 0; mi_m < 2; mi_m++) {
long l1_idx = key01 & ((1L << mi_nbits) - 1);
const __m128* pq_l1 = pq_l1_t + M_2 * l1_idx;
for (int m = 0; m < M_2; m++) {
__m128 qi = _mm_loadu_ps(qa);
__m128 recons = _mm_add_ps(pq_l1[m], pq_l2_t[*code++]);
__m128 diff = _mm_sub_ps(qi, recons);
accu = _mm_add_ps(accu, _mm_mul_ps(diff, diff));
pq_l2_t += 256;
qa += 4;
}
pq_l1_t += M_2 << mi_nbits;
key01 >>= mi_nbits;
}
accu = _mm_hadd_ps(accu, accu);
accu = _mm_hadd_ps(accu, accu);
return _mm_cvtss_f32(accu);
#else
FAISS_THROW_MSG("not implemented for non-x64 platforms");
#endif
}
};
} // namespace
DistanceComputer* Index2Layer::get_distance_computer() const {
#ifdef __SSE3__
const MultiIndexQuantizer* mi =
dynamic_cast<MultiIndexQuantizer*>(q1.quantizer);
if (mi && pq.M % 2 == 0 && pq.dsub == 4) {
return new Distance2xXPQ4(*this);
}
const IndexFlat* fl = dynamic_cast<IndexFlat*>(q1.quantizer);
if (fl && pq.dsub == 4) {
return new DistanceXPQ4(*this);
}
#endif
return Index::get_distance_computer();
}
/* The standalone codec interface */
void Index2Layer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
FAISS_THROW_IF_NOT(is_trained);
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("Index2Layer::add: adding %" PRId64 ":%" PRId64
" / %" PRId64 "\n",
i0,
i1,
n);
}
sa_encode(i1 - i0, x + i0 * d, bytes + i0 * code_size);
}
return;
}
std::unique_ptr<int64_t[]> list_nos(new int64_t[n]);
q1.quantizer->assign(n, x, list_nos.get());
std::vector<float> residuals(n * d);
for (idx_t i = 0; i < n; i++) {
q1.quantizer->compute_residual(
x + i * d, residuals.data() + i * d, list_nos[i]);
}
pq.compute_codes(residuals.data(), bytes, n);
for (idx_t i = n - 1; i >= 0; i--) {
uint8_t* code = bytes + i * code_size;
memmove(code + code_size_1, bytes + i * code_size_2, code_size_2);
q1.encode_listno(list_nos[i], code);
}
}
void Index2Layer::sa_decode(idx_t n, const uint8_t* bytes, float* x) const {
#pragma omp parallel
{
std::vector<float> residual(d);
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const uint8_t* code = bytes + i * code_size;
int64_t list_no = q1.decode_listno(code);
float* xi = x + i * d;
pq.decode(code + code_size_1, xi);
q1.quantizer->reconstruct(list_no, residual.data());
for (int j = 0; j < d; j++) {
xi[j] += residual[j];
}
}
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexFlatCodes.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPQ.h>
namespace faiss {
struct IndexIVFPQ;
/** Same as an IndexIVFPQ without the inverted lists: codes are stored
* sequentially
*
* The class is mainly intended to store encoded vectors that can be
* accessed randomly; the search function is not implemented.
*/
struct Index2Layer : IndexFlatCodes {
/// first level quantizer
Level1Quantizer q1;
/// second level quantizer is always a PQ
ProductQuantizer pq;
/// size of the code for the first level (ceil(log8(q1.nlist)))
size_t code_size_1;
/// size of the code for the second level
size_t code_size_2;
Index2Layer(
Index* quantizer,
size_t nlist,
int M,
int nbit = 8,
MetricType metric = METRIC_L2);
Index2Layer();
~Index2Layer();
void train(idx_t n, const float* x) override;
/// not implemented
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
DistanceComputer* get_distance_computer() const override;
/// transfer the flat codes to an IVFPQ index
void transfer_to_IVFPQ(IndexIVFPQ& other) const;
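/* Example (sketch): using Index2Layer as a staging codec for an
* IndexIVFPQ with matching nlist and PQ layout; this assumes the
* IVFPQ reuses the staging index's trained quantizers.
*
*     faiss::Index2Layer staging(&coarse, nlist, M);
*     staging.train(nt, xt);
*     staging.add(n, x);
*     faiss::IndexIVFPQ ivfpq(&coarse, d, nlist, M, 8);
*     ivfpq.pq = staging.pq; // reuse the trained PQ
*     ivfpq.is_trained = true;
*     staging.transfer_to_IVFPQ(ivfpq);
*/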
/* The standalone codec interface */
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// quiet the noise
// clang-format off
#include <faiss/IndexAdditiveQuantizer.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/ResultHandler.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/extra_distances.h>
#include <faiss/utils/utils.h>
namespace faiss {
/**************************************************************************************
* IndexAdditiveQuantizer
**************************************************************************************/
IndexAdditiveQuantizer::IndexAdditiveQuantizer(
idx_t d,
AdditiveQuantizer* aq,
MetricType metric):
IndexFlatCodes(aq->code_size, d, metric), aq(aq)
{
FAISS_THROW_IF_NOT(metric == METRIC_INNER_PRODUCT || metric == METRIC_L2);
}
namespace {
template <class VectorDistance, class ResultHandler>
void search_with_decompress(
const IndexAdditiveQuantizer& ir,
const float* xq,
VectorDistance& vd,
ResultHandler& res) {
const uint8_t* codes = ir.codes.data();
size_t ntotal = ir.ntotal;
size_t code_size = ir.code_size;
const AdditiveQuantizer *aq = ir.aq;
using SingleResultHandler = typename ResultHandler::SingleResultHandler;
#pragma omp parallel for if(res.nq > 100)
for (int64_t q = 0; q < res.nq; q++) {
SingleResultHandler resi(res);
resi.begin(q);
std::vector<float> tmp(ir.d);
const float* x = xq + ir.d * q;
for (size_t i = 0; i < ntotal; i++) {
aq->decode(codes + i * code_size, tmp.data(), 1);
float dis = vd(x, tmp.data());
resi.add_result(dis, i);
}
resi.end();
}
}
template<bool is_IP, AdditiveQuantizer::Search_type_t st, class ResultHandler>
void search_with_LUT(
const IndexAdditiveQuantizer& ir,
const float* xq,
ResultHandler& res)
{
const AdditiveQuantizer & aq = *ir.aq;
const uint8_t* codes = ir.codes.data();
size_t ntotal = ir.ntotal;
size_t code_size = aq.code_size;
size_t nq = res.nq;
size_t d = ir.d;
using SingleResultHandler = typename ResultHandler::SingleResultHandler;
std::unique_ptr<float []> LUT(new float[nq * aq.total_codebook_size]);
aq.compute_LUT(nq, xq, LUT.get());
#pragma omp parallel for if(nq > 100)
for (int64_t q = 0; q < nq; q++) {
SingleResultHandler resi(res);
resi.begin(q);
std::vector<float> tmp(aq.d);
const float *LUT_q = LUT.get() + aq.total_codebook_size * q;
float bias = 0;
if (!is_IP) { // the LUT function returns ||y||^2 - 2 * <x, y>, need to add ||x||^2
bias = fvec_norm_L2sqr(xq + q * d, d);
}
for (size_t i = 0; i < ntotal; i++) {
float dis = aq.compute_1_distance_LUT<is_IP, st>(
codes + i * code_size,
LUT_q
);
resi.add_result(dis + bias, i);
}
resi.end();
}
}
} // anonymous namespace
void IndexAdditiveQuantizer::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
if (aq->search_type == AdditiveQuantizer::ST_decompress) {
if (metric_type == METRIC_L2) {
using VD = VectorDistance<METRIC_L2>;
VD vd = {size_t(d), metric_arg};
HeapResultHandler<VD::C> rh(n, distances, labels, k);
search_with_decompress(*this, x, vd, rh);
} else if (metric_type == METRIC_INNER_PRODUCT) {
using VD = VectorDistance<METRIC_INNER_PRODUCT>;
VD vd = {size_t(d), metric_arg};
HeapResultHandler<VD::C> rh(n, distances, labels, k);
search_with_decompress(*this, x, vd, rh);
}
} else {
if (metric_type == METRIC_INNER_PRODUCT) {
HeapResultHandler<CMin<float, idx_t> > rh(n, distances, labels, k);
search_with_LUT<true, AdditiveQuantizer::ST_LUT_nonorm> (*this, x, rh);
} else {
HeapResultHandler<CMax<float, idx_t> > rh(n, distances, labels, k);
if (aq->search_type == AdditiveQuantizer::ST_norm_float) {
search_with_LUT<false, AdditiveQuantizer::ST_norm_float> (*this, x, rh);
} else if (aq->search_type == AdditiveQuantizer::ST_LUT_nonorm) {
search_with_LUT<false, AdditiveQuantizer::ST_LUT_nonorm> (*this, x, rh);
} else if (aq->search_type == AdditiveQuantizer::ST_norm_qint8) {
search_with_LUT<false, AdditiveQuantizer::ST_norm_qint8> (*this, x, rh);
} else if (aq->search_type == AdditiveQuantizer::ST_norm_qint4) {
search_with_LUT<false, AdditiveQuantizer::ST_norm_qint4> (*this, x, rh);
} else if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8) {
search_with_LUT<false, AdditiveQuantizer::ST_norm_cqint8> (*this, x, rh);
} else if (aq->search_type == AdditiveQuantizer::ST_norm_cqint4) {
search_with_LUT<false, AdditiveQuantizer::ST_norm_cqint4> (*this, x, rh);
} else {
FAISS_THROW_FMT("search type %d not supported", aq->search_type);
}
}
}
}
void IndexAdditiveQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
return aq->compute_codes(x, bytes, n);
}
void IndexAdditiveQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) const {
return aq->decode(bytes, x, n);
}
/**************************************************************************************
* IndexResidualQuantizer
**************************************************************************************/
IndexResidualQuantizer::IndexResidualQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric,
Search_type_t search_type)
: IndexResidualQuantizer(d, std::vector<size_t>(M, nbits), metric, search_type) {
}
IndexResidualQuantizer::IndexResidualQuantizer(
int d,
const std::vector<size_t>& nbits,
MetricType metric,
Search_type_t search_type)
: IndexAdditiveQuantizer(d, &rq, metric), rq(d, nbits, search_type) {
code_size = rq.code_size;
is_trained = false;
}
IndexResidualQuantizer::IndexResidualQuantizer() : IndexResidualQuantizer(0, 0, 0) {}
void IndexResidualQuantizer::train(idx_t n, const float* x) {
rq.train(n, x);
is_trained = true;
}
/**************************************************************************************
* IndexLocalSearchQuantizer
**************************************************************************************/
IndexLocalSearchQuantizer::IndexLocalSearchQuantizer(
int d,
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric,
Search_type_t search_type)
: IndexAdditiveQuantizer(d, &lsq, metric), lsq(d, M, nbits, search_type) {
code_size = lsq.code_size;
is_trained = false;
}
IndexLocalSearchQuantizer::IndexLocalSearchQuantizer() : IndexLocalSearchQuantizer(0, 0, 0) {}
void IndexLocalSearchQuantizer::train(idx_t n, const float* x) {
lsq.train(n, x);
is_trained = true;
}
/**************************************************************************************
* AdditiveCoarseQuantizer
**************************************************************************************/
AdditiveCoarseQuantizer::AdditiveCoarseQuantizer(
idx_t d,
AdditiveQuantizer* aq,
MetricType metric):
Index(d, metric), aq(aq)
{}
void AdditiveCoarseQuantizer::add(idx_t, const float*) {
FAISS_THROW_MSG("not applicable");
}
void AdditiveCoarseQuantizer::reconstruct(idx_t key, float* recons) const {
aq->decode_64bit(key, recons);
}
void AdditiveCoarseQuantizer::reset() {
FAISS_THROW_MSG("not applicable");
}
void AdditiveCoarseQuantizer::train(idx_t n, const float* x) {
if (verbose) {
printf("AdditiveCoarseQuantizer::train: training on %zd vectors\n", size_t(n));
}
aq->train(n, x);
is_trained = true;
ntotal = (idx_t)1 << aq->tot_bits;
if (metric_type == METRIC_L2) {
if (verbose) {
printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal));
}
// this is not necessary for the ResidualCoarseQuantizer when
// using beam search. We'll see if the memory overhead is too high
centroid_norms.resize(ntotal);
aq->compute_centroid_norms(centroid_norms.data());
}
}
void AdditiveCoarseQuantizer::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
if (metric_type == METRIC_INNER_PRODUCT) {
aq->knn_centroids_inner_product(n, x, k, distances, labels);
} else if (metric_type == METRIC_L2) {
FAISS_THROW_IF_NOT(centroid_norms.size() == ntotal);
aq->knn_centroids_L2(
n, x, k, distances, labels, centroid_norms.data());
}
}
/**************************************************************************************
* ResidualCoarseQuantizer
**************************************************************************************/
ResidualCoarseQuantizer::ResidualCoarseQuantizer(
int d, ///< dimensionality of the input vectors
const std::vector<size_t>& nbits,
MetricType metric)
: AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits), beam_factor(4.0) {
FAISS_THROW_IF_NOT(rq.tot_bits <= 63);
is_trained = false;
}
ResidualCoarseQuantizer::ResidualCoarseQuantizer(
int d,
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric)
: ResidualCoarseQuantizer(d, std::vector<size_t>(M, nbits), metric) {}
ResidualCoarseQuantizer::ResidualCoarseQuantizer(): ResidualCoarseQuantizer(0, 0, 0) {}
void ResidualCoarseQuantizer::set_beam_factor(float new_beam_factor) {
beam_factor = new_beam_factor;
if (new_beam_factor > 0) {
FAISS_THROW_IF_NOT(new_beam_factor >= 1.0);
return;
} else if (metric_type == METRIC_L2 && ntotal != centroid_norms.size()) {
if (verbose) {
printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal));
}
centroid_norms.resize(ntotal);
aq->compute_centroid_norms(centroid_norms.data());
}
}
void ResidualCoarseQuantizer::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
if (beam_factor < 0) {
AdditiveCoarseQuantizer::search(n, x, k, distances, labels);
return;
}
int beam_size = int(k * beam_factor);
if (beam_size > ntotal) {
beam_size = ntotal;
}
size_t memory_per_point = rq.memory_per_point(beam_size);
/*
printf("mem per point %ld n=%d max_mem_distance=%ld mem_kb=%zd\n",
memory_per_point, int(n), rq.max_mem_distances, get_mem_usage_kb());
*/
if (n > 1 && memory_per_point * n > rq.max_mem_distances) {
// then split queries to reduce temp memory
idx_t bs = rq.max_mem_distances / memory_per_point;
if (bs == 0) {
bs = 1; // otherwise we can't do much
}
if (verbose) {
printf("ResidualCoarseQuantizer::search: run %d searches in batches of size %d\n",
int(n),
int(bs));
}
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(n, i0 + bs);
search(i1 - i0, x + i0 * d, k, distances + i0 * k, labels + i0 * k);
InterruptCallback::check();
}
return;
}
std::vector<int32_t> codes(beam_size * rq.M * n);
std::vector<float> beam_distances(n * beam_size);
rq.refine_beam(
n, 1, x, beam_size, codes.data(), nullptr, beam_distances.data());
#pragma omp parallel for if (n > 4000)
for (idx_t i = 0; i < n; i++) {
memcpy(distances + i * k,
beam_distances.data() + beam_size * i,
k * sizeof(distances[0]));
const int32_t* codes_i = codes.data() + beam_size * i * rq.M;
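// pack the M sub-quantizer codes of result j into a single centroid id:
// code m occupies rq.nbits[m] bits, least-significant bits first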
for (idx_t j = 0; j < k; j++) {
idx_t l = 0;
int shift = 0;
for (int m = 0; m < rq.M; m++) {
l |= (*codes_i++) << shift;
shift += rq.nbits[m];
}
labels[i * k + j] = l;
}
}
}
/**************************************************************************************
* LocalSearchCoarseQuantizer
**************************************************************************************/
LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric)
: AdditiveCoarseQuantizer(d, &lsq, metric), lsq(d, M, nbits) {
FAISS_THROW_IF_NOT(lsq.tot_bits <= 63);
is_trained = false;
}
LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer() {
aq = &lsq;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef FAISS_INDEX_ADDITIVE_QUANTIZER_H
#define FAISS_INDEX_ADDITIVE_QUANTIZER_H
#include <faiss/impl/AdditiveQuantizer.h>
#include <cstdint>
#include <vector>
#include <faiss/IndexFlatCodes.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/// Abstract class for additive quantizers. The search functions are in common.
struct IndexAdditiveQuantizer : IndexFlatCodes {
// the quantizer, this points to the relevant field in the inheriting
// classes
AdditiveQuantizer* aq;
using Search_type_t = AdditiveQuantizer::Search_type_t;
explicit IndexAdditiveQuantizer(
idx_t d = 0,
AdditiveQuantizer* aq = nullptr,
MetricType metric = METRIC_L2);
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
/* The standalone codec interface */
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
/** Index based on a residual quantizer. Stored vectors are
* approximated by residual quantization codes.
* Can also be used as a codec
*/
struct IndexResidualQuantizer : IndexAdditiveQuantizer {
/// The residual quantizer used to encode the vectors
ResidualQuantizer rq;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
IndexResidualQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexResidualQuantizer(
int d,
const std::vector<size_t>& nbits,
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexResidualQuantizer();
void train(idx_t n, const float* x) override;
};
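/* Example (sketch): a residual quantizer index used as a small-code
* codec, here with M = 8 sub-quantizers of 8 bits each (8-byte codes).
*
*     faiss::IndexResidualQuantizer irq(d, 8, 8);
*     irq.train(nt, xt);
*     irq.add(n, x);
*     irq.search(nq, queries, k, distances, labels);
*/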
struct IndexLocalSearchQuantizer : IndexAdditiveQuantizer {
LocalSearchQuantizer lsq;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
IndexLocalSearchQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexLocalSearchQuantizer();
void train(idx_t n, const float* x) override;
};
/** A "virtual" index where the elements are the residual quantizer centroids.
*
* Intended for use as a coarse quantizer in an IndexIVF.
*/
struct AdditiveCoarseQuantizer : Index {
AdditiveQuantizer* aq;
explicit AdditiveCoarseQuantizer(
idx_t d = 0,
AdditiveQuantizer* aq = nullptr,
MetricType metric = METRIC_L2);
/// norms of centroids, useful for knn-search
std::vector<float> centroid_norms;
/// N/A
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reconstruct(idx_t key, float* recons) const override;
void train(idx_t n, const float* x) override;
/// N/A
void reset() override;
};
/** The ResidualCoarseQuantizer is a bit specialized compared to the
* default AdditiveCoarseQuantizer because it can use a beam search
* at search time (slow but may be useful for very large vocabularies) */
struct ResidualCoarseQuantizer : AdditiveCoarseQuantizer {
/// The residual quantizer used to encode the vectors
ResidualQuantizer rq;
/// factor between the beam size and the search k
/// if negative, use exact search-to-centroid
float beam_factor;
/// computes centroid norms if required
void set_beam_factor(float new_beam_factor);
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
ResidualCoarseQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric = METRIC_L2);
ResidualCoarseQuantizer(
int d,
const std::vector<size_t>& nbits,
MetricType metric = METRIC_L2);
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
ResidualCoarseQuantizer();
};
struct LocalSearchCoarseQuantizer : AdditiveCoarseQuantizer {
/// The local search quantizer used to encode the vectors
LocalSearchQuantizer lsq;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
LocalSearchCoarseQuantizer(
int d, ///< dimensionality of the input vectors
size_t M, ///< number of subquantizers
size_t nbits, ///< number of bit per subvector index
MetricType metric = METRIC_L2);
LocalSearchCoarseQuantizer();
};
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinary.h>
#include <faiss/impl/FaissAssert.h>
#include <cinttypes>
#include <cstring>
namespace faiss {
IndexBinary::~IndexBinary() {}
void IndexBinary::train(idx_t, const uint8_t*) {
// Does nothing by default.
}
void IndexBinary::range_search(idx_t, const uint8_t*, int, RangeSearchResult*)
const {
FAISS_THROW_MSG("range search not implemented");
}
void IndexBinary::assign(idx_t n, const uint8_t* x, idx_t* labels, idx_t k)
const {
std::vector<int> distances(n * k);
search(n, x, k, distances.data(), labels);
}
void IndexBinary::add_with_ids(idx_t, const uint8_t*, const idx_t*) {
FAISS_THROW_MSG("add_with_ids not implemented for this type of index");
}
size_t IndexBinary::remove_ids(const IDSelector&) {
FAISS_THROW_MSG("remove_ids not implemented for this type of index");
return 0;
}
void IndexBinary::reconstruct(idx_t, uint8_t*) const {
FAISS_THROW_MSG("reconstruct not implemented for this type of index");
}
void IndexBinary::reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const {
for (idx_t i = 0; i < ni; i++) {
reconstruct(i0 + i, recons + i * d);
}
}
void IndexBinary::search_and_reconstruct(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels,
uint8_t* recons) const {
FAISS_THROW_IF_NOT(k > 0);
search(n, x, k, distances, labels);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
uint8_t* reconstructed = recons + ij * d;
if (key < 0) {
// fill with 0xFF sentinel bytes (binary codes have no NaN)
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
reconstruct(key, reconstructed);
}
}
}
}
void IndexBinary::display() const {
printf("Index: %s -> %" PRId64 " elements\n",
typeid(*this).name(),
ntotal);
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_H
#define FAISS_INDEX_BINARY_H
#include <cstdio>
#include <sstream>
#include <string>
#include <typeinfo>
#include <faiss/Index.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/// Forward declarations see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
/** Abstract structure for a binary index.
*
* Supports adding vectors and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinary {
using idx_t = Index::idx_t; ///< all indices are this type
using component_t = uint8_t;
using distance_t = int32_t;
int d; ///< vector dimension
int code_size; ///< number of bytes per vector ( = d / 8 )
idx_t ntotal; ///< total nb of indexed vectors
bool verbose; ///< verbosity level
/// set if the Index does not require training, or if training is done
/// already
bool is_trained;
/// type of metric this index uses for search
MetricType metric_type;
explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_L2)
: d(d),
code_size(d / 8),
ntotal(0),
verbose(false),
is_trained(true),
metric_type(metric) {
FAISS_THROW_IF_NOT(d % 8 == 0);
}
virtual ~IndexBinary();
/** Perform training on a representative set of vectors.
*
* @param n nb of training vectors
* @param x training vectors, size n * d / 8
*/
virtual void train(idx_t n, const uint8_t* x);
/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
* @param x input matrix, size n * d / 8
*/
virtual void add(idx_t n, const uint8_t* x) = 0;
/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids);
/** Query n vectors of dimension d to the index.
*
* return at most k vectors. If there are not enough results for a
* query, the result array is padded with -1s.
*
* @param x input vectors to search, size n * d / 8
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const = 0;
/** Query n vectors of dimension d to the index.
*
* return all vectors with distance < radius. Note that many indexes
* do not implement the range_search (only the k-NN search is
* mandatory). The distances are converted to float to reuse the
* RangeSearchResult structure, but they are integer. By convention,
* only distances < radius (strict comparison) are returned,
* ie. radius = 0 does not return any result and 1 returns only
* exact same vectors.
*
* @param x input vectors to search, size n * d / 8
* @param radius search radius
* @param result result table
*/
virtual void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const;
/** Return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns labels of
* neighbors.
* @param x input vectors to search, size n * d / 8
* @param labels output labels of the NNs, size n*k
*/
void assign(idx_t n, const uint8_t* x, idx_t* labels, idx_t k = 1) const;
/// Removes all elements from the database.
virtual void reset() = 0;
/** Removes IDs from the index. Not supported by all indexes.
*/
virtual size_t remove_ids(const IDSelector& sel);
/** Reconstruct a stored vector.
*
* This function may not be defined for some indexes.
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d / 8)
*/
virtual void reconstruct(idx_t key, uint8_t* recons) const;
/** Reconstruct vectors i0 to i0 + ni - 1.
*
* This function may not be defined for some indexes.
 * @param recons reconstructed vectors (size ni * d / 8)
*/
virtual void reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting array
* is padded with -1s.
*
 * @param recons reconstructed vectors, size (n, k, d / 8)
**/
virtual void search_and_reconstruct(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels,
uint8_t* recons) const;
/** Display the actual class name and some more info. */
void display() const;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_H
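// ---------------------------------------------------------------------
// Illustrative usage sketch (hypothetical example function, not part of
// faiss) for the IndexBinary API above. Vectors are bit-packed: a d-bit
// vector occupies d / 8 bytes, so add()/search() take buffers of
// n * d / 8 uint8_t. Uses the IndexBinaryFlat subclass declared further
// down in this commit.
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryFlat.h>
#include <vector>

void index_binary_usage_sketch() {
    int d = 64; // dimension in bits; must be a multiple of 8
    faiss::IndexBinaryFlat index(d);

    // 1000 bit-packed database vectors (all zeros here, for brevity)
    std::vector<uint8_t> xb(1000 * d / 8, 0);
    index.add(1000, xb.data());

    faiss::Index::idx_t k = 5;
    std::vector<int32_t> distances(k);          // Hamming distances
    std::vector<faiss::Index::idx_t> labels(k); // neighbor ids
    index.search(/*n=*/1, xb.data(), k, distances.data(), labels.data());
}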
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <cstring>
namespace faiss {
IndexBinaryFlat::IndexBinaryFlat(idx_t d) : IndexBinary(d) {}
void IndexBinaryFlat::add(idx_t n, const uint8_t* x) {
xb.insert(xb.end(), x, x + n * code_size);
ntotal += n;
}
void IndexBinaryFlat::reset() {
xb.clear();
ntotal = 0;
}
void IndexBinaryFlat::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
const idx_t block_size = query_batch_size;
for (idx_t s = 0; s < n; s += block_size) {
idx_t nn = block_size;
if (s + block_size > n) {
nn = n - s;
}
if (use_heap) {
// We see the distances and labels as heaps.
int_maxheap_array_t res = {
size_t(nn), size_t(k), labels + s * k, distances + s * k};
hammings_knn_hc(
&res,
x + s * code_size,
xb.data(),
ntotal,
code_size,
/* ordered = */ true);
} else {
hammings_knn_mc(
x + s * code_size,
xb.data(),
nn,
ntotal,
k,
code_size,
distances + s * k,
labels + s * k);
}
}
}
size_t IndexBinaryFlat::remove_ids(const IDSelector& sel) {
idx_t j = 0;
for (idx_t i = 0; i < ntotal; i++) {
if (sel.is_member(i)) {
// should be removed
} else {
if (i > j) {
memmove(&xb[code_size * j],
&xb[code_size * i],
sizeof(xb[0]) * code_size);
}
j++;
}
}
long nremove = ntotal - j;
if (nremove > 0) {
ntotal = j;
xb.resize(ntotal * code_size);
}
return nremove;
}
void IndexBinaryFlat::reconstruct(idx_t key, uint8_t* recons) const {
memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
}
void IndexBinaryFlat::range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const {
hamming_range_search(x, xb.data(), n, ntotal, radius, code_size, result);
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_BINARY_FLAT_H
#define INDEX_BINARY_FLAT_H
#include <vector>
#include <faiss/IndexBinary.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search. */
struct IndexBinaryFlat : IndexBinary {
/// database vectors, size ntotal * d / 8
std::vector<uint8_t> xb;
/** Select between using a heap or counting to select the k smallest values
* when scanning inverted lists.
*/
bool use_heap = true;
size_t query_batch_size = 32;
explicit IndexBinaryFlat(idx_t d);
void add(idx_t n, const uint8_t* x) override;
void reset() override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void reconstruct(idx_t key, uint8_t* recons) const override;
/** Remove some ids. Note that because of the indexing structure,
* the semantics of this operation are different from the usual ones:
* the new ids are shifted. */
size_t remove_ids(const IDSelector& sel) override;
IndexBinaryFlat() {}
};
} // namespace faiss
#endif // INDEX_BINARY_FLAT_H
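// ---------------------------------------------------------------------
// Illustrative sketch (hypothetical example function, not part of faiss)
// of the remove_ids() semantics documented above: remaining vectors are
// compacted, so the surviving ids are shifted down. IDSelectorRange is
// declared in faiss/impl/AuxIndexStructures.h.
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <vector>

void remove_ids_usage_sketch() {
    int d = 64;
    faiss::IndexBinaryFlat index(d);
    std::vector<uint8_t> xb(10 * d / 8, 0);
    index.add(10, xb.data()); // ids 0..9

    faiss::IDSelectorRange sel(2, 5); // selects ids 2, 3, 4
    size_t nremoved = index.remove_ids(sel); // nremoved == 3
    // the vectors formerly at ids 5..9 now have ids 2..6; ntotal == 7
    (void)nremoved;
}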
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/utils/utils.h>
#include <algorithm>
#include <cmath>
#include <memory>
namespace faiss {
IndexBinaryFromFloat::IndexBinaryFromFloat() {}
IndexBinaryFromFloat::IndexBinaryFromFloat(Index* index)
: IndexBinary(index->d), index(index), own_fields(false) {
is_trained = index->is_trained;
ntotal = index->ntotal;
}
IndexBinaryFromFloat::~IndexBinaryFromFloat() {
if (own_fields) {
delete index;
}
}
void IndexBinaryFromFloat::add(idx_t n, const uint8_t* x) {
constexpr idx_t bs = 32768;
std::unique_ptr<float[]> xf(new float[bs * d]);
for (idx_t b = 0; b < n; b += bs) {
idx_t bn = std::min(bs, n - b);
binary_to_real(bn * d, x + b * code_size, xf.get());
index->add(bn, xf.get());
}
ntotal = index->ntotal;
}
void IndexBinaryFromFloat::reset() {
index->reset();
ntotal = index->ntotal;
}
void IndexBinaryFromFloat::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
constexpr idx_t bs = 32768;
std::unique_ptr<float[]> xf(new float[bs * d]);
std::unique_ptr<float[]> df(new float[bs * k]);
for (idx_t b = 0; b < n; b += bs) {
idx_t bn = std::min(bs, n - b);
binary_to_real(bn * d, x + b * code_size, xf.get());
index->search(bn, xf.get(), k, df.get(), labels + b * k);
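        // binary_to_real() maps each bit to -1 / +1, so for an L2 backing
        // index the squared distance between two such vectors is 4x their
        // Hamming distance; dividing by 4 recovers an integer distance.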
for (int i = 0; i < bn * k; ++i) {
distances[b * k + i] = int32_t(std::round(df[i] / 4.0));
}
}
}
void IndexBinaryFromFloat::train(idx_t n, const uint8_t* x) {
std::unique_ptr<float[]> xf(new float[n * d]);
binary_to_real(n * d, x, xf.get());
index->train(n, xf.get());
is_trained = true;
ntotal = index->ntotal;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_FROM_FLOAT_H
#define FAISS_INDEX_BINARY_FROM_FLOAT_H
#include <faiss/IndexBinary.h>
namespace faiss {
struct Index;
/** IndexBinary backed by a float Index.
*
 * Supports adding vectors and searching them.
*
* All queries are symmetric because there is no distinction between codes and
* vectors.
*/
struct IndexBinaryFromFloat : IndexBinary {
Index* index = nullptr;
bool own_fields = false; ///< Whether object owns the index pointer.
IndexBinaryFromFloat();
explicit IndexBinaryFromFloat(Index* index);
~IndexBinaryFromFloat();
void add(idx_t n, const uint8_t* x) override;
void reset() override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void train(idx_t n, const uint8_t* x) override;
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_FROM_FLOAT_H
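// ---------------------------------------------------------------------
// Illustrative usage sketch (hypothetical example function, not part of
// faiss): wrap a float index so it can be queried with bit-packed
// vectors. Setting own_fields transfers ownership of the float index.
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryFromFloat.h>
#include <faiss/IndexFlat.h>
#include <vector>

void binary_from_float_usage_sketch() {
    int d = 64;
    auto* flat = new faiss::IndexFlatL2(d); // backing float index
    faiss::IndexBinaryFromFloat index(flat);
    index.own_fields = true; // the wrapper now deletes flat

    std::vector<uint8_t> xb(100 * d / 8, 0);
    index.add(100, xb.data()); // converted to -1/+1 floats internally

    std::vector<int32_t> distances(4);
    std::vector<faiss::Index::idx_t> labels(4);
    index.search(1, xb.data(), 4, distances.data(), labels.data());
}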
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryHNSW.h>
#include <omp.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <queue>
#include <unordered_set>
#include <stdint.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/random.h>
namespace faiss {
/**************************************************************
* add / search blocks of descriptors
**************************************************************/
namespace {
void hnsw_add_vertices(
IndexBinaryHNSW& index_hnsw,
size_t n0,
size_t n,
const uint8_t* x,
bool verbose,
bool preset_levels = false) {
HNSW& hnsw = index_hnsw.hnsw;
size_t ntotal = n0 + n;
double t0 = getmillisecs();
if (verbose) {
printf("hnsw_add_vertices: adding %zd elements on top of %zd "
"(preset_levels=%d)\n",
n,
n0,
int(preset_levels));
}
int max_level = hnsw.prepare_level_tab(n, preset_levels);
if (verbose) {
printf(" max_level = %d\n", max_level);
}
std::vector<omp_lock_t> locks(ntotal);
for (int i = 0; i < ntotal; i++) {
omp_init_lock(&locks[i]);
}
// add vectors from highest to lowest level
std::vector<int> hist;
std::vector<int> order(n);
{ // make buckets with vectors of the same level
// build histogram
for (int i = 0; i < n; i++) {
HNSW::storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
while (pt_level >= hist.size()) {
hist.push_back(0);
}
hist[pt_level]++;
}
// accumulate
std::vector<int> offsets(hist.size() + 1, 0);
        for (size_t i = 0; i + 1 < hist.size(); i++) {
offsets[i + 1] = offsets[i] + hist[i];
}
// bucket sort
for (int i = 0; i < n; i++) {
HNSW::storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
order[offsets[pt_level]++] = pt_id;
}
}
{ // perform add
RandomGenerator rng2(789);
int i1 = n;
for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
int i0 = i1 - hist[pt_level];
if (verbose) {
printf("Adding %d elements at level %d\n", i1 - i0, pt_level);
}
// random permutation to get rid of dataset order bias
for (int j = i0; j < i1; j++) {
std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
}
#pragma omp parallel
{
VisitedTable vt(ntotal);
std::unique_ptr<DistanceComputer> dis(
index_hnsw.get_distance_computer());
int prev_display =
verbose && omp_get_thread_num() == 0 ? 0 : -1;
#pragma omp for schedule(dynamic)
for (int i = i0; i < i1; i++) {
HNSW::storage_idx_t pt_id = order[i];
dis->set_query(
(float*)(x + (pt_id - n0) * index_hnsw.code_size));
hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
if (prev_display >= 0 && i - i0 > prev_display + 10000) {
prev_display = i - i0;
printf(" %d / %d\r", i - i0, i1 - i0);
fflush(stdout);
}
}
}
i1 = i0;
}
FAISS_ASSERT(i1 == 0);
}
if (verbose) {
printf("Done in %.3f ms\n", getmillisecs() - t0);
}
for (int i = 0; i < ntotal; i++)
omp_destroy_lock(&locks[i]);
}
} // anonymous namespace
/**************************************************************
* IndexBinaryHNSW implementation
**************************************************************/
IndexBinaryHNSW::IndexBinaryHNSW() {
is_trained = true;
}
IndexBinaryHNSW::IndexBinaryHNSW(int d, int M)
: IndexBinary(d),
hnsw(M),
own_fields(true),
storage(new IndexBinaryFlat(d)) {
is_trained = true;
}
IndexBinaryHNSW::IndexBinaryHNSW(IndexBinary* storage, int M)
: IndexBinary(storage->d),
hnsw(M),
own_fields(false),
storage(storage) {
is_trained = true;
}
IndexBinaryHNSW::~IndexBinaryHNSW() {
if (own_fields) {
delete storage;
}
}
void IndexBinaryHNSW::train(idx_t n, const uint8_t* x) {
// hnsw structure does not require training
storage->train(n, x);
is_trained = true;
}
void IndexBinaryHNSW::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
#pragma omp parallel
{
VisitedTable vt(ntotal);
std::unique_ptr<DistanceComputer> dis(get_distance_computer());
#pragma omp for
for (idx_t i = 0; i < n; i++) {
idx_t* idxi = labels + i * k;
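            // the int32 output buffer doubles as float storage for the
            // HNSW heap; the rounding loop below converts it back to
            // integer Hamming distances in place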
float* simi = (float*)(distances + i * k);
dis->set_query((float*)(x + i * code_size));
maxheap_heapify(k, simi, idxi);
hnsw.search(*dis, k, idxi, simi, vt);
maxheap_reorder(k, simi, idxi);
}
}
#pragma omp parallel for
for (int i = 0; i < n * k; ++i) {
distances[i] = std::round(((float*)distances)[i]);
}
}
void IndexBinaryHNSW::add(idx_t n, const uint8_t* x) {
FAISS_THROW_IF_NOT(is_trained);
int n0 = ntotal;
storage->add(n, x);
ntotal = storage->ntotal;
hnsw_add_vertices(*this, n0, n, x, verbose, hnsw.levels.size() == ntotal);
}
void IndexBinaryHNSW::reset() {
hnsw.reset();
storage->reset();
ntotal = 0;
}
void IndexBinaryHNSW::reconstruct(idx_t key, uint8_t* recons) const {
storage->reconstruct(key, recons);
}
namespace {
template <class HammingComputer>
struct FlatHammingDis : DistanceComputer {
const int code_size;
const uint8_t* b;
size_t ndis;
HammingComputer hc;
float operator()(idx_t i) override {
ndis++;
return hc.hamming(b + i * code_size);
}
float symmetric_dis(idx_t i, idx_t j) override {
return HammingComputerDefault(b + j * code_size, code_size)
.hamming(b + i * code_size);
}
explicit FlatHammingDis(const IndexBinaryFlat& storage)
: code_size(storage.code_size),
b(storage.xb.data()),
ndis(0),
hc() {}
// NOTE: Pointers are cast from float in order to reuse the floating-point
// DistanceComputer.
void set_query(const float* x) override {
hc.set((uint8_t*)x, code_size);
}
~FlatHammingDis() override {
#pragma omp critical
{ hnsw_stats.ndis += ndis; }
}
};
} // namespace
DistanceComputer* IndexBinaryHNSW::get_distance_computer() const {
IndexBinaryFlat* flat_storage = dynamic_cast<IndexBinaryFlat*>(storage);
FAISS_ASSERT(flat_storage != nullptr);
switch (code_size) {
case 4:
return new FlatHammingDis<HammingComputer4>(*flat_storage);
case 8:
return new FlatHammingDis<HammingComputer8>(*flat_storage);
case 16:
return new FlatHammingDis<HammingComputer16>(*flat_storage);
case 20:
return new FlatHammingDis<HammingComputer20>(*flat_storage);
case 32:
return new FlatHammingDis<HammingComputer32>(*flat_storage);
case 64:
return new FlatHammingDis<HammingComputer64>(*flat_storage);
default:
break;
}
return new FlatHammingDis<HammingComputerDefault>(*flat_storage);
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/HNSW.h>
#include <faiss/utils/utils.h>
namespace faiss {
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexBinaryHNSW : IndexBinary {
typedef HNSW::storage_idx_t storage_idx_t;
    // the link structure
HNSW hnsw;
// the sequential storage
bool own_fields;
IndexBinary* storage;
explicit IndexBinaryHNSW();
explicit IndexBinaryHNSW(int d, int M = 32);
explicit IndexBinaryHNSW(IndexBinary* storage, int M = 32);
~IndexBinaryHNSW() override;
DistanceComputer* get_distance_computer() const;
void add(idx_t n, const uint8_t* x) override;
/// Trains the storage if needed
void train(idx_t n, const uint8_t* x) override;
/// entry point for search
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void reconstruct(idx_t key, uint8_t* recons) const override;
void reset() override;
};
} // namespace faiss
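// ---------------------------------------------------------------------
// Illustrative usage sketch (hypothetical example function, not part of
// faiss). M is the number of graph links per node; hnsw.efConstruction
// and hnsw.efSearch trade accuracy against build/query time.
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryHNSW.h>
#include <vector>

void binary_hnsw_usage_sketch() {
    int d = 256;
    faiss::IndexBinaryHNSW index(d, /*M=*/16); // creates its own flat storage
    index.hnsw.efConstruction = 40; // build-time beam width
    index.hnsw.efSearch = 32;       // query-time beam width

    std::vector<uint8_t> xb(10000 * d / 8, 0);
    index.add(10000, xb.data());

    std::vector<int32_t> distances(10);
    std::vector<faiss::Index::idx_t> labels(10);
    index.search(1, xb.data(), 10, distances.data(), labels.data());
}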
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryHash.h>
#include <cinttypes>
#include <cstdio>
#include <memory>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
void IndexBinaryHash::InvertedList::add(
idx_t id,
size_t code_size,
const uint8_t* code) {
ids.push_back(id);
vecs.insert(vecs.end(), code, code + code_size);
}
IndexBinaryHash::IndexBinaryHash(int d, int b)
: IndexBinary(d), b(b), nflip(0) {
is_trained = true;
}
IndexBinaryHash::IndexBinaryHash() : b(0), nflip(0) {
is_trained = true;
}
void IndexBinaryHash::reset() {
invlists.clear();
ntotal = 0;
}
void IndexBinaryHash::add(idx_t n, const uint8_t* x) {
add_with_ids(n, x, nullptr);
}
void IndexBinaryHash::add_with_ids(
idx_t n,
const uint8_t* x,
const idx_t* xids) {
uint64_t mask = ((uint64_t)1 << b) - 1;
// simplistic add function. Cannot really be parallelized.
for (idx_t i = 0; i < n; i++) {
idx_t id = xids ? xids[i] : ntotal + i;
const uint8_t* xi = x + i * code_size;
idx_t hash = *((uint64_t*)xi) & mask;
invlists[hash].add(id, code_size, xi);
}
ntotal += n;
}
namespace {
/** Enumerate all bit vectors of size nbit with up to maxflip 1s
* test in P127257851 P127258235
*/
struct FlipEnumerator {
int nbit, nflip, maxflip;
uint64_t mask, x;
FlipEnumerator(int nbit, int maxflip) : nbit(nbit), maxflip(maxflip) {
nflip = 0;
mask = 0;
x = 0;
}
bool next() {
if (x == mask) {
if (nflip == maxflip) {
return false;
}
// increase Hamming radius
nflip++;
mask = (((uint64_t)1 << nflip) - 1);
x = mask << (nbit - nflip);
return true;
}
int i = __builtin_ctzll(x);
if (i > 0) {
x ^= (uint64_t)3 << (i - 1);
} else {
// nb of LSB 1s
int n1 = __builtin_ctzll(~x);
// clear them
x &= ((uint64_t)(-1) << n1);
int n2 = __builtin_ctzll(x);
x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
}
return true;
}
};
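// Example of the enumeration order for nbit = 4, maxflip = 2:
//   0000                               (0 flips)
//   1000, 0100, 0010, 0001             (1 flip)
//   1100, 1010, 1001, 0110, 0101, 0011 (2 flips)
// i.e. all masks with at most maxflip bits set, by increasing weight.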
using idx_t = Index::idx_t;
struct RangeSearchResults {
int radius;
RangeQueryResult& qres;
inline void add(float dis, idx_t id) {
if (dis < radius) {
qres.add(dis, id);
}
}
};
struct KnnSearchResults {
// heap params
idx_t k;
int32_t* heap_sim;
idx_t* heap_ids;
using C = CMax<int, idx_t>;
inline void add(float dis, idx_t id) {
if (dis < heap_sim[0]) {
heap_replace_top<C>(k, heap_sim, heap_ids, dis, id);
}
}
};
template <class HammingComputer, class SearchResults>
void search_single_query_template(
const IndexBinaryHash& index,
const uint8_t* q,
SearchResults& res,
size_t& n0,
size_t& nlist,
size_t& ndis) {
size_t code_size = index.code_size;
uint64_t mask = ((uint64_t)1 << index.b) - 1;
uint64_t qhash = *((uint64_t*)q) & mask;
HammingComputer hc(q, code_size);
FlipEnumerator fe(index.b, index.nflip);
    // loop over hash buckets whose key differs from qhash in at most
    // nflip bits
do {
uint64_t hash = qhash ^ fe.x;
auto it = index.invlists.find(hash);
if (it == index.invlists.end()) {
continue;
}
const IndexBinaryHash::InvertedList& il = it->second;
size_t nv = il.ids.size();
if (nv == 0) {
n0++;
} else {
const uint8_t* codes = il.vecs.data();
for (size_t i = 0; i < nv; i++) {
int dis = hc.hamming(codes);
res.add(dis, il.ids[i]);
codes += code_size;
}
ndis += nv;
nlist++;
}
} while (fe.next());
}
template <class SearchResults>
void search_single_query(
const IndexBinaryHash& index,
const uint8_t* q,
SearchResults& res,
size_t& n0,
size_t& nlist,
size_t& ndis) {
#define HC(name) \
search_single_query_template<name>(index, q, res, n0, nlist, ndis);
switch (index.code_size) {
case 4:
HC(HammingComputer4);
break;
case 8:
HC(HammingComputer8);
break;
case 16:
HC(HammingComputer16);
break;
case 20:
HC(HammingComputer20);
break;
case 32:
HC(HammingComputer32);
break;
default:
HC(HammingComputerDefault);
break;
}
#undef HC
}
} // anonymous namespace
void IndexBinaryHash::range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const {
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel if (n > 100) reduction(+ : ndis, n0, nlist)
{
RangeSearchPartialResult pres(result);
#pragma omp for
for (idx_t i = 0; i < n; i++) { // loop queries
RangeQueryResult& qres = pres.new_result(i);
RangeSearchResults res = {radius, qres};
const uint8_t* q = x + i * code_size;
search_single_query(*this, q, res, n0, nlist, ndis);
}
pres.finalize();
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
void IndexBinaryHash::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
using HeapForL2 = CMax<int32_t, idx_t>;
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel for if (n > 100) reduction(+ : nlist, ndis, n0)
for (idx_t i = 0; i < n; i++) {
int32_t* simi = distances + k * i;
idx_t* idxi = labels + k * i;
heap_heapify<HeapForL2>(k, simi, idxi);
KnnSearchResults res = {k, simi, idxi};
const uint8_t* q = x + i * code_size;
search_single_query(*this, q, res, n0, nlist, ndis);
heap_reorder<HeapForL2>(k, simi, idxi);
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
size_t IndexBinaryHash::hashtable_size() const {
return invlists.size();
}
void IndexBinaryHash::display() const {
for (auto it = invlists.begin(); it != invlists.end(); ++it) {
printf("%" PRId64 ": [", it->first);
const std::vector<idx_t>& v = it->second.ids;
for (auto x : v) {
printf("%" PRId64 " ", x);
}
printf("]\n");
}
}
void IndexBinaryHashStats::reset() {
memset((void*)this, 0, sizeof(*this));
}
IndexBinaryHashStats indexBinaryHash_stats;
/*******************************************************
* IndexBinaryMultiHash implementation
******************************************************/
IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b)
: IndexBinary(d),
storage(new IndexBinaryFlat(d)),
own_fields(true),
maps(nhash),
nhash(nhash),
b(b),
nflip(0) {
FAISS_THROW_IF_NOT(nhash * b <= d);
}
IndexBinaryMultiHash::IndexBinaryMultiHash()
: storage(nullptr), own_fields(true), nhash(0), b(0), nflip(0) {}
IndexBinaryMultiHash::~IndexBinaryMultiHash() {
if (own_fields) {
delete storage;
}
}
void IndexBinaryMultiHash::reset() {
storage->reset();
ntotal = 0;
    for (auto& map : maps) {
map.clear();
}
}
void IndexBinaryMultiHash::add(idx_t n, const uint8_t* x) {
storage->add(n, x);
// populate maps
uint64_t mask = ((uint64_t)1 << b) - 1;
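    // each of the nhash maps hashes a disjoint b-bit slice of the code;
    // ho is the bit offset of slice h: byte ho >> 3, bit ho & 7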
for (idx_t i = 0; i < n; i++) {
const uint8_t* xi = x + i * code_size;
int ho = 0;
for (int h = 0; h < nhash; h++) {
uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
hash &= mask;
maps[h][hash].push_back(i + ntotal);
ho += b;
}
}
ntotal += n;
}
namespace {
template <class HammingComputer, class SearchResults>
static void verify_shortlist(
const IndexBinaryFlat& index,
const uint8_t* q,
const std::unordered_set<Index::idx_t>& shortlist,
SearchResults& res) {
size_t code_size = index.code_size;
HammingComputer hc(q, code_size);
const uint8_t* codes = index.xb.data();
for (auto i : shortlist) {
int dis = hc.hamming(codes + i * code_size);
res.add(dis, i);
}
}
template <class SearchResults>
void search_1_query_multihash(
const IndexBinaryMultiHash& index,
const uint8_t* xi,
SearchResults& res,
size_t& n0,
size_t& nlist,
size_t& ndis) {
std::unordered_set<idx_t> shortlist;
int b = index.b;
uint64_t mask = ((uint64_t)1 << b) - 1;
int ho = 0;
for (int h = 0; h < index.nhash; h++) {
uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7);
qhash &= mask;
const IndexBinaryMultiHash::Map& map = index.maps[h];
FlipEnumerator fe(index.b, index.nflip);
        // loop over hash buckets whose key differs from qhash in at most
        // nflip bits
do {
uint64_t hash = qhash ^ fe.x;
auto it = map.find(hash);
if (it != map.end()) {
const std::vector<idx_t>& v = it->second;
for (auto i : v) {
shortlist.insert(i);
}
nlist++;
} else {
n0++;
}
} while (fe.next());
ho += b;
}
ndis += shortlist.size();
// verify shortlist
#define HC(name) verify_shortlist<name>(*index.storage, xi, shortlist, res)
switch (index.code_size) {
case 4:
HC(HammingComputer4);
break;
case 8:
HC(HammingComputer8);
break;
case 16:
HC(HammingComputer16);
break;
case 20:
HC(HammingComputer20);
break;
case 32:
HC(HammingComputer32);
break;
default:
HC(HammingComputerDefault);
break;
}
#undef HC
}
} // anonymous namespace
void IndexBinaryMultiHash::range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const {
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel if (n > 100) reduction(+ : ndis, n0, nlist)
{
RangeSearchPartialResult pres(result);
#pragma omp for
for (idx_t i = 0; i < n; i++) { // loop queries
RangeQueryResult& qres = pres.new_result(i);
RangeSearchResults res = {radius, qres};
const uint8_t* q = x + i * code_size;
search_1_query_multihash(*this, q, res, n0, nlist, ndis);
}
pres.finalize();
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
void IndexBinaryMultiHash::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
using HeapForL2 = CMax<int32_t, idx_t>;
size_t nlist = 0, ndis = 0, n0 = 0;
#pragma omp parallel for if (n > 100) reduction(+ : nlist, ndis, n0)
for (idx_t i = 0; i < n; i++) {
int32_t* simi = distances + k * i;
idx_t* idxi = labels + k * i;
heap_heapify<HeapForL2>(k, simi, idxi);
KnnSearchResults res = {k, simi, idxi};
const uint8_t* q = x + i * code_size;
search_1_query_multihash(*this, q, res, n0, nlist, ndis);
heap_reorder<HeapForL2>(k, simi, idxi);
}
indexBinaryHash_stats.nq += n;
indexBinaryHash_stats.n0 += n0;
indexBinaryHash_stats.nlist += nlist;
indexBinaryHash_stats.ndis += ndis;
}
size_t IndexBinaryMultiHash::hashtable_size() const {
size_t tot = 0;
    for (const auto& map : maps) {
tot += map.size();
}
return tot;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_BINARY_HASH_H
#define FAISS_BINARY_HASH_H
#include <unordered_map>
#include <vector>
#include <faiss/IndexBinary.h>
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct RangeSearchResult;
/** just uses the first b bits as a hash value */
struct IndexBinaryHash : IndexBinary {
struct InvertedList {
std::vector<idx_t> ids;
std::vector<uint8_t> vecs;
void add(idx_t id, size_t code_size, const uint8_t* code);
};
using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
InvertedListMap invlists;
int b, nflip;
IndexBinaryHash(int d, int b);
IndexBinaryHash();
void reset() override;
void add(idx_t n, const uint8_t* x) override;
void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids) override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void display() const;
size_t hashtable_size() const;
};
struct IndexBinaryHashStats {
size_t nq; // nb of queries run
size_t n0; // nb of empty lists
size_t nlist; // nb of non-empty inverted lists scanned
    size_t ndis; // nb of distances computed
IndexBinaryHashStats() {
reset();
}
void reset();
};
FAISS_API extern IndexBinaryHashStats indexBinaryHash_stats;
/** hashes the vector nhash times on disjoint b-bit slices of the code */
struct IndexBinaryMultiHash : IndexBinary {
// where the vectors are actually stored
IndexBinaryFlat* storage;
bool own_fields;
// maps hash values to the ids that hash to them
using Map = std::unordered_map<idx_t, std::vector<idx_t>>;
// the different hashes, size nhash
std::vector<Map> maps;
int nhash; ///< nb of hash maps
int b; ///< nb bits per hash map
int nflip; ///< nb bit flips to use at search time
IndexBinaryMultiHash(int d, int nhash, int b);
IndexBinaryMultiHash();
~IndexBinaryMultiHash();
void reset() override;
void add(idx_t n, const uint8_t* x) override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
size_t hashtable_size() const;
};
} // namespace faiss
#endif
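// ---------------------------------------------------------------------
// Illustrative usage sketch (hypothetical example function, not part of
// faiss). b selects how many leading bits form the hash key; nflip
// widens the search to all buckets whose key differs from the query's
// key in at most nflip bits (see FlipEnumerator in the .cpp above).
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryHash.h>
#include <vector>

void binary_hash_usage_sketch() {
    int d = 64;
    faiss::IndexBinaryHash index(d, /*b=*/16); // hash on the first 16 bits
    std::vector<uint8_t> xb(1000 * d / 8, 0);
    index.add(1000, xb.data());

    index.nflip = 2; // probe buckets within Hamming distance 2 of the key
    std::vector<int32_t> distances(10);
    std::vector<faiss::Index::idx_t> labels(10);
    index.search(1, xb.data(), 10, distances.data(), labels.data());
}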
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexBinaryIVF.h>
#include <omp.h>
#include <cinttypes>
#include <cstdio>
#include <algorithm>
#include <memory>
#include <faiss/IndexFlat.h>
#include <faiss/IndexLSH.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
namespace faiss {
IndexBinaryIVF::IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist)
: IndexBinary(d),
invlists(new ArrayInvertedLists(nlist, code_size)),
own_invlists(true),
nprobe(1),
max_codes(0),
quantizer(quantizer),
nlist(nlist),
own_fields(false),
clustering_index(nullptr) {
FAISS_THROW_IF_NOT(d == quantizer->d);
is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
cp.niter = 10;
}
IndexBinaryIVF::IndexBinaryIVF()
: invlists(nullptr),
own_invlists(false),
nprobe(1),
max_codes(0),
quantizer(nullptr),
nlist(0),
own_fields(false),
clustering_index(nullptr) {}
void IndexBinaryIVF::add(idx_t n, const uint8_t* x) {
add_with_ids(n, x, nullptr);
}
void IndexBinaryIVF::add_with_ids(
idx_t n,
const uint8_t* x,
const idx_t* xids) {
add_core(n, x, xids, nullptr);
}
void IndexBinaryIVF::add_core(
idx_t n,
const uint8_t* x,
const idx_t* xids,
const idx_t* precomputed_idx) {
FAISS_THROW_IF_NOT(is_trained);
assert(invlists);
direct_map.check_can_add(xids);
const idx_t* idx;
std::unique_ptr<idx_t[]> scoped_idx;
if (precomputed_idx) {
idx = precomputed_idx;
} else {
scoped_idx.reset(new idx_t[n]);
quantizer->assign(n, x, scoped_idx.get());
idx = scoped_idx.get();
}
idx_t n_add = 0;
for (size_t i = 0; i < n; i++) {
idx_t id = xids ? xids[i] : ntotal + i;
idx_t list_no = idx[i];
if (list_no < 0) {
direct_map.add_single_id(id, -1, 0);
} else {
const uint8_t* xi = x + i * code_size;
size_t offset = invlists->add_entry(list_no, id, xi);
direct_map.add_single_id(id, list_no, offset);
}
n_add++;
}
if (verbose) {
printf("IndexBinaryIVF::add_with_ids: added "
"%" PRId64 " / %" PRId64 " vectors\n",
n_add,
n);
}
ntotal += n_add;
}
void IndexBinaryIVF::make_direct_map(bool b) {
if (b) {
direct_map.set_type(DirectMap::Array, invlists, ntotal);
} else {
direct_map.set_type(DirectMap::NoMap, invlists, ntotal);
}
}
void IndexBinaryIVF::set_direct_map_type(DirectMap::Type type) {
direct_map.set_type(type, invlists, ntotal);
}
void IndexBinaryIVF::search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT(nprobe > 0);
const size_t nprobe = std::min(nlist, this->nprobe);
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
double t0 = getmillisecs();
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists(idx.get(), n * nprobe);
search_preassigned(
n, x, k, idx.get(), coarse_dis.get(), distances, labels, false);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexBinaryIVF::reconstruct(idx_t key, uint8_t* recons) const {
idx_t lo = direct_map.get(key);
reconstruct_from_offset(lo_listno(lo), lo_offset(lo), recons);
}
void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const {
FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t list_size = invlists->list_size(list_no);
const Index::idx_t* idlist = invlists->get_ids(list_no);
for (idx_t offset = 0; offset < list_size; offset++) {
idx_t id = idlist[offset];
if (!(id >= i0 && id < i0 + ni)) {
continue;
}
            uint8_t* reconstructed = recons + (id - i0) * code_size;
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
void IndexBinaryIVF::search_and_reconstruct(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels,
uint8_t* recons) const {
const size_t nprobe = std::min(nlist, this->nprobe);
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT(nprobe > 0);
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
invlists->prefetch_lists(idx.get(), n * nprobe);
// search_preassigned() with `store_pairs` enabled to obtain the list_no
// and offset into `codes` for reconstruction
search_preassigned(
n,
x,
k,
idx.get(),
coarse_dis.get(),
distances,
labels,
/* store_pairs */ true);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
            uint8_t* reconstructed = recons + ij * code_size;
            if (key < 0) {
                // fill with 0xff to mark an invalid entry
                memset(reconstructed, -1, code_size);
} else {
int list_no = key >> 32;
int offset = key & 0xffffffff;
// Update label to the actual id
labels[ij] = invlists->get_single_id(list_no, offset);
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
}
void IndexBinaryIVF::reconstruct_from_offset(
idx_t list_no,
idx_t offset,
uint8_t* recons) const {
memcpy(recons, invlists->get_single_code(list_no, offset), code_size);
}
void IndexBinaryIVF::reset() {
direct_map.clear();
invlists->reset();
ntotal = 0;
}
size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) {
size_t nremove = direct_map.remove_ids(sel, invlists);
ntotal -= nremove;
return nremove;
}
void IndexBinaryIVF::train(idx_t n, const uint8_t* x) {
if (verbose) {
printf("Training quantizer\n");
}
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose) {
printf("IVF quantizer does not need training.\n");
}
} else {
if (verbose) {
printf("Training quantizer on %" PRId64 " vectors in %dD\n", n, d);
}
Clustering clus(d, nlist, cp);
quantizer->reset();
IndexFlatL2 index_tmp(d);
if (clustering_index && verbose) {
printf("using clustering_index of dimension %d to do the clustering\n",
clustering_index->d);
}
// LSH codec that is able to convert the binary vectors to floats.
IndexLSH codec(d, d, false, false);
clus.train_encoded(
n, x, &codec, clustering_index ? *clustering_index : index_tmp);
// convert clusters to binary
std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());
quantizer->add(clus.k, x_b.get());
quantizer->is_trained = true;
}
is_trained = true;
}
void IndexBinaryIVF::merge_from(IndexBinaryIVF& other, idx_t add_id) {
// minimal sanity checks
FAISS_THROW_IF_NOT(other.d == d);
FAISS_THROW_IF_NOT(other.nlist == nlist);
FAISS_THROW_IF_NOT(other.code_size == code_size);
FAISS_THROW_IF_NOT_MSG(
direct_map.no() && other.direct_map.no(),
"direct map copy not implemented");
FAISS_THROW_IF_NOT_MSG(
typeid(*this) == typeid(other),
"can only merge indexes of the same type");
invlists->merge_from(other.invlists, add_id);
ntotal += other.ntotal;
other.ntotal = 0;
}
void IndexBinaryIVF::replace_invlists(InvertedLists* il, bool own) {
FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size);
if (own_invlists) {
delete invlists;
}
invlists = il;
own_invlists = own;
}
namespace {
using idx_t = Index::idx_t;
template <class HammingComputer>
struct IVFBinaryScannerL2 : BinaryInvertedListScanner {
HammingComputer hc;
size_t code_size;
bool store_pairs;
IVFBinaryScannerL2(size_t code_size, bool store_pairs)
: code_size(code_size), store_pairs(store_pairs) {}
void set_query(const uint8_t* query_vector) override {
hc.set(query_vector, code_size);
}
idx_t list_no;
void set_list(idx_t list_no, uint8_t /* coarse_dis */) override {
this->list_no = list_no;
}
uint32_t distance_to_code(const uint8_t* code) const override {
return hc.hamming(code);
}
size_t scan_codes(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int32_t* simi,
idx_t* idxi,
size_t k) const override {
using C = CMax<int32_t, idx_t>;
size_t nup = 0;
for (size_t j = 0; j < n; j++) {
uint32_t dis = hc.hamming(codes);
if (dis < simi[0]) {
idx_t id = store_pairs ? lo_build(list_no, j) : ids[j];
heap_replace_top<C>(k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
return nup;
}
void scan_codes_range(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int radius,
RangeQueryResult& result) const override {
for (size_t j = 0; j < n; j++) {
uint32_t dis = hc.hamming(codes);
if (dis < radius) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
result.add(dis, id);
}
codes += code_size;
}
}
};
void search_knn_hamming_heap(
const IndexBinaryIVF& ivf,
size_t n,
const uint8_t* x,
idx_t k,
const idx_t* keys,
const int32_t* coarse_dis,
int32_t* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params) {
idx_t nprobe = params ? params->nprobe : ivf.nprobe;
nprobe = std::min((idx_t)ivf.nlist, nprobe);
idx_t max_codes = params ? params->max_codes : ivf.max_codes;
MetricType metric_type = ivf.metric_type;
// almost verbatim copy from IndexIVF::search_preassigned
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForIP = CMin<int32_t, idx_t>;
using HeapForL2 = CMax<int32_t, idx_t>;
#pragma omp parallel if (n > 1) reduction(+ : nlistv, ndis, nheap)
{
std::unique_ptr<BinaryInvertedListScanner> scanner(
ivf.get_InvertedListScanner(store_pairs));
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const uint8_t* xi = x + i * ivf.code_size;
scanner->set_query(xi);
const idx_t* keysi = keys + i * nprobe;
int32_t* simi = distances + k * i;
idx_t* idxi = labels + k * i;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_heapify<HeapForIP>(k, simi, idxi);
} else {
heap_heapify<HeapForL2>(k, simi, idxi);
}
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT(
key < (idx_t)ivf.nlist,
"Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n",
key,
ik,
ivf.nlist);
scanner->set_list(key, coarse_dis[i * nprobe + ik]);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes(ivf.invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t* ids = nullptr;
if (!store_pairs) {
sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes(
list_size, scodes.get(), ids, simi, idxi, k);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_reorder<HeapForIP>(k, simi, idxi);
} else {
heap_reorder<HeapForL2>(k, simi, idxi);
}
} // parallel for
} // parallel
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nheap_updates += nheap;
}
template <class HammingComputer, bool store_pairs>
void search_knn_hamming_count(
const IndexBinaryIVF& ivf,
size_t nx,
const uint8_t* x,
const idx_t* keys,
int k,
int32_t* distances,
idx_t* labels,
const IVFSearchParameters* params) {
const int nBuckets = ivf.d + 1;
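    // Hamming distances lie in [0, d], so counting with d + 1 buckets
    // replaces the heap; results come out sorted by increasing distance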
std::vector<int> all_counters(nx * nBuckets, 0);
std::unique_ptr<idx_t[]> all_ids_per_dis(new idx_t[nx * nBuckets * k]);
idx_t nprobe = params ? params->nprobe : ivf.nprobe;
nprobe = std::min((idx_t)ivf.nlist, nprobe);
idx_t max_codes = params ? params->max_codes : ivf.max_codes;
std::vector<HCounterState<HammingComputer>> cs;
for (size_t i = 0; i < nx; ++i) {
cs.push_back(HCounterState<HammingComputer>(
all_counters.data() + i * nBuckets,
all_ids_per_dis.get() + i * nBuckets * k,
x + i * ivf.code_size,
ivf.d,
k));
}
size_t nlistv = 0, ndis = 0;
#pragma omp parallel for reduction(+ : nlistv, ndis)
for (int64_t i = 0; i < nx; i++) {
const idx_t* keysi = keys + i * nprobe;
HCounterState<HammingComputer>& csi = cs[i];
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT(
key < (idx_t)ivf.nlist,
"Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n",
key,
ik,
ivf.nlist);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes(ivf.invlists, key);
const uint8_t* list_vecs = scodes.get();
const Index::idx_t* ids =
store_pairs ? nullptr : ivf.invlists->get_ids(key);
for (size_t j = 0; j < list_size; j++) {
const uint8_t* yj = list_vecs + ivf.code_size * j;
idx_t id = store_pairs ? (key << 32 | j) : ids[j];
csi.update_counter(yj, id);
}
if (ids)
ivf.invlists->release_ids(key, ids);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
int nres = 0;
for (int b = 0; b < nBuckets && nres < k; b++) {
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
distances[i * k + nres] = b;
nres++;
}
}
while (nres < k) {
labels[i * k + nres] = -1;
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
++nres;
}
}
indexIVF_stats.nq += nx;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
}
template <bool store_pairs>
void search_knn_hamming_count_1(
const IndexBinaryIVF& ivf,
size_t nx,
const uint8_t* x,
const idx_t* keys,
int k,
int32_t* distances,
idx_t* labels,
const IVFSearchParameters* params) {
switch (ivf.code_size) {
#define HANDLE_CS(cs) \
case cs: \
search_knn_hamming_count<HammingComputer##cs, store_pairs>( \
ivf, nx, x, keys, k, distances, labels, params); \
break;
HANDLE_CS(4);
HANDLE_CS(8);
HANDLE_CS(16);
HANDLE_CS(20);
HANDLE_CS(32);
HANDLE_CS(64);
#undef HANDLE_CS
default:
search_knn_hamming_count<HammingComputerDefault, store_pairs>(
ivf, nx, x, keys, k, distances, labels, params);
break;
}
}
} // namespace
BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner(
bool store_pairs) const {
#define HC(name) return new IVFBinaryScannerL2<name>(code_size, store_pairs)
switch (code_size) {
case 4:
HC(HammingComputer4);
case 8:
HC(HammingComputer8);
case 16:
HC(HammingComputer16);
case 20:
HC(HammingComputer20);
case 32:
HC(HammingComputer32);
case 64:
HC(HammingComputer64);
default:
HC(HammingComputerDefault);
}
#undef HC
}
void IndexBinaryIVF::search_preassigned(
idx_t n,
const uint8_t* x,
idx_t k,
const idx_t* idx,
const int32_t* coarse_dis,
int32_t* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params) const {
if (use_heap) {
search_knn_hamming_heap(
*this,
n,
x,
k,
idx,
coarse_dis,
distances,
labels,
store_pairs,
params);
} else {
if (store_pairs) {
search_knn_hamming_count_1<true>(
*this, n, x, idx, k, distances, labels, params);
} else {
search_knn_hamming_count_1<false>(
*this, n, x, idx, k, distances, labels, params);
}
}
}
void IndexBinaryIVF::range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* res) const {
const size_t nprobe = std::min(nlist, this->nprobe);
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
double t0 = getmillisecs();
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists(idx.get(), n * nprobe);
range_search_preassigned(n, x, radius, idx.get(), coarse_dis.get(), res);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexBinaryIVF::range_search_preassigned(
idx_t n,
const uint8_t* x,
int radius,
const idx_t* assign,
const int32_t* centroid_dis,
RangeSearchResult* res) const {
const size_t nprobe = std::min(nlist, this->nprobe);
bool store_pairs = false;
size_t nlistv = 0, ndis = 0;
std::vector<RangeSearchPartialResult*> all_pres(omp_get_max_threads());
#pragma omp parallel reduction(+ : nlistv, ndis)
{
RangeSearchPartialResult pres(res);
std::unique_ptr<BinaryInvertedListScanner> scanner(
get_InvertedListScanner(store_pairs));
FAISS_THROW_IF_NOT(scanner.get());
all_pres[omp_get_thread_num()] = &pres;
auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) {
idx_t key = assign[i * nprobe + ik]; /* select the list */
if (key < 0)
return;
FAISS_THROW_IF_NOT_FMT(
key < (idx_t)nlist,
"Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n",
key,
ik,
nlist);
const size_t list_size = invlists->list_size(key);
if (list_size == 0)
return;
InvertedLists::ScopedCodes scodes(invlists, key);
InvertedLists::ScopedIds ids(invlists, key);
            scanner->set_list(key, centroid_dis[i * nprobe + ik]);
nlistv++;
ndis += list_size;
scanner->scan_codes_range(
list_size, scodes.get(), ids.get(), radius, qres);
};
#pragma omp for
for (idx_t i = 0; i < n; i++) {
scanner->set_query(x + i * code_size);
RangeQueryResult& qres = pres.new_result(i);
for (size_t ik = 0; ik < nprobe; ik++) {
scan_list_func(i, ik, qres);
}
}
pres.finalize();
}
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
}
IndexBinaryIVF::~IndexBinaryIVF() {
if (own_invlists) {
delete invlists;
}
if (own_fields) {
delete quantizer;
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_BINARY_IVF_H
#define FAISS_INDEX_BINARY_IVF_H
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/IndexBinary.h>
#include <faiss/IndexIVF.h>
#include <faiss/utils/Heap.h>
namespace faiss {
struct BinaryInvertedListScanner;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an IndexBinary instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
 * Otherwise, the object is similar to IndexIVF.
*/
struct IndexBinaryIVF : IndexBinary {
/// Access to the actual data
InvertedLists* invlists;
bool own_invlists;
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Select between using a heap or counting to select the k smallest values
* when scanning inverted lists.
*/
bool use_heap = true;
/// map for direct access to the elements. Enables reconstruct().
DirectMap direct_map;
IndexBinary* quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
bool own_fields; ///< whether object owns the quantizer
ClusteringParameters cp; ///< to override default clustering params
Index* clustering_index; ///< to override index used during clustering
/** The Inverted file takes a quantizer (an IndexBinary) on input,
* which implements the function mapping a vector to a list
* identifier. The pointer is borrowed: the quantizer should not
* be deleted while the IndexBinaryIVF is in use.
*/
IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist);
IndexBinaryIVF();
~IndexBinaryIVF() override;
void reset() override;
/// Trains the quantizer
void train(idx_t n, const uint8_t* x) override;
void add(idx_t n, const uint8_t* x) override;
void add_with_ids(idx_t n, const uint8_t* x, const idx_t* xids) override;
/** Implementation of vector addition where the vector assignments are
* predefined.
*
* @param precomputed_idx quantization indices for the input vectors
* (size n)
*/
void add_core(
idx_t n,
const uint8_t* x,
const idx_t* xids,
const idx_t* precomputed_idx);
    /** Search a set of vectors that are pre-quantized by the IVF
     * quantizer. Fills the corresponding heaps with the query
     * results. search() calls this.
     *
     * @param n nb of vectors to query
     * @param x query vectors, size n * d / 8
     * @param assign coarse quantization indices, size n * nprobe
     * @param centroid_dis
     *            distances to coarse centroids, size n * nprobe
     * @param distances
     *            output distances, size n * k
     * @param labels output labels, size n * k
     * @param store_pairs store inv list index + inv list offset in
     *                    upper/lower 32 bits of the result ids,
     *                    instead of vector ids (used for reranking).
* @param params used to override the object's search parameters
*/
void search_preassigned(
idx_t n,
const uint8_t* x,
idx_t k,
const idx_t* assign,
const int32_t* centroid_dis,
int32_t* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr) const;
virtual BinaryInvertedListScanner* get_InvertedListScanner(
bool store_pairs = false) const;
    /** assign the vectors, then call search_preassigned */
void search(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const uint8_t* x,
int radius,
RangeSearchResult* result) const override;
void range_search_preassigned(
idx_t n,
const uint8_t* x,
int radius,
const idx_t* assign,
const int32_t* centroid_dis,
RangeSearchResult* result) const;
void reconstruct(idx_t key, uint8_t* recons) const override;
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d / 8
*/
void reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d / 8)
*/
void search_and_reconstruct(
idx_t n,
const uint8_t* x,
idx_t k,
int32_t* distances,
idx_t* labels,
uint8_t* recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset(
idx_t list_no,
idx_t offset,
uint8_t* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
    /** moves the entries from another dataset to self. On output,
     * other is empty. add_id is added to all moved ids (for
     * sequential ids, this would be this->ntotal) */
virtual void merge_from(IndexBinaryIVF& other, idx_t add_id);
size_t get_list_size(size_t list_no) const {
return invlists->list_size(list_no);
}
    /** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map(bool new_maintain_direct_map = true);
void set_direct_map_type(DirectMap::Type type);
void replace_invlists(InvertedLists* il, bool own = false);
};
struct BinaryInvertedListScanner {
using idx_t = Index::idx_t;
/// from now on we handle this query.
virtual void set_query(const uint8_t* query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list(idx_t list_no, uint8_t coarse_dis) = 0;
/// compute a single query-to-code distance
virtual uint32_t distance_to_code(const uint8_t* code) const = 0;
/** compute the distances to codes. (distances, labels) should be
* organized as a min- or max-heap
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
*/
virtual size_t scan_codes(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int32_t* distances,
idx_t* labels,
size_t k) const = 0;
virtual void scan_codes_range(
size_t n,
const uint8_t* codes,
const idx_t* ids,
int radius,
RangeQueryResult& result) const = 0;
virtual ~BinaryInvertedListScanner() {}
};
} // namespace faiss
#endif // FAISS_INDEX_BINARY_IVF_H
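// ---------------------------------------------------------------------
// Illustrative usage sketch (hypothetical example function, not part of
// faiss). The coarse quantizer is itself an IndexBinary whose ntotal
// centroids define the nlist inverted lists; nprobe of them are scanned
// per query.
// ---------------------------------------------------------------------
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>
#include <vector>

void binary_ivf_usage_sketch() {
    int d = 256;
    size_t nlist = 64;
    faiss::IndexBinaryFlat quantizer(d); // borrowed by the IVF index
    faiss::IndexBinaryIVF index(&quantizer, d, nlist);

    std::vector<uint8_t> xt(10000 * d / 8, 0);
    index.train(10000, xt.data()); // k-means over -1/+1-decoded vectors
    index.add(10000, xt.data());

    index.nprobe = 8; // scan 8 of the 64 inverted lists per query
    std::vector<int32_t> distances(10);
    std::vector<faiss::Index::idx_t> labels(10);
    index.search(1, xt.data(), 10, distances.data(), labels.data());
}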