Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/extra_distances.h>
#include <faiss/utils/utils.h>
#include <cstring>
#include <limits>
namespace faiss {
IndexFlat::IndexFlat(idx_t d, MetricType metric)
: IndexFlatCodes(sizeof(float) * d, d, metric) {}
void IndexFlat::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
// we see the distances and labels as heaps
if (metric_type == METRIC_INNER_PRODUCT) {
float_minheap_array_t res = {size_t(n), size_t(k), labels, distances};
knn_inner_product(x, get_xb(), d, n, ntotal, &res);
} else if (metric_type == METRIC_L2) {
float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
knn_L2sqr(x, get_xb(), d, n, ntotal, &res);
} else {
float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
knn_extra_metrics(
x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res);
}
}
void IndexFlat::range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const {
switch (metric_type) {
case METRIC_INNER_PRODUCT:
range_search_inner_product(
x, get_xb(), d, n, ntotal, radius, result);
break;
case METRIC_L2:
range_search_L2sqr(x, get_xb(), d, n, ntotal, radius, result);
break;
default:
FAISS_THROW_MSG("metric type not supported");
}
}
void IndexFlat::compute_distance_subset(
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels) const {
switch (metric_type) {
case METRIC_INNER_PRODUCT:
fvec_inner_products_by_idx(distances, x, get_xb(), labels, d, n, k);
break;
case METRIC_L2:
fvec_L2sqr_by_idx(distances, x, get_xb(), labels, d, n, k);
break;
default:
FAISS_THROW_MSG("metric type not supported");
}
}
namespace {
struct FlatL2Dis : DistanceComputer {
size_t d;
Index::idx_t nb;
const float* q;
const float* b;
size_t ndis;
float operator()(idx_t i) override {
ndis++;
return fvec_L2sqr(q, b + i * d, d);
}
float symmetric_dis(idx_t i, idx_t j) override {
return fvec_L2sqr(b + j * d, b + i * d, d);
}
explicit FlatL2Dis(const IndexFlat& storage, const float* q = nullptr)
: d(storage.d),
nb(storage.ntotal),
q(q),
b(storage.get_xb()),
ndis(0) {}
void set_query(const float* x) override {
q = x;
}
};
struct FlatIPDis : DistanceComputer {
size_t d;
Index::idx_t nb;
const float* q;
const float* b;
size_t ndis;
float operator()(idx_t i) override {
ndis++;
return fvec_inner_product(q, b + i * d, d);
}
float symmetric_dis(idx_t i, idx_t j) override {
return fvec_inner_product(b + j * d, b + i * d, d);
}
explicit FlatIPDis(const IndexFlat& storage, const float* q = nullptr)
: d(storage.d),
nb(storage.ntotal),
q(q),
b(storage.get_xb()),
ndis(0) {}
void set_query(const float* x) override {
q = x;
}
};
} // namespace
DistanceComputer* IndexFlat::get_distance_computer() const {
if (metric_type == METRIC_L2) {
return new FlatL2Dis(*this);
} else if (metric_type == METRIC_INNER_PRODUCT) {
return new FlatIPDis(*this);
} else {
return get_extra_distance_computer(
d, metric_type, metric_arg, ntotal, get_xb());
}
}
void IndexFlat::reconstruct(idx_t key, float* recons) const {
memcpy(recons, &(codes[key * code_size]), code_size);
}
void IndexFlat::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
if (n > 0) {
memcpy(bytes, x, sizeof(float) * d * n);
}
}
void IndexFlat::sa_decode(idx_t n, const uint8_t* bytes, float* x) const {
if (n > 0) {
memcpy(x, bytes, sizeof(float) * d * n);
}
}
/***************************************************
* IndexFlat1D
***************************************************/
IndexFlat1D::IndexFlat1D(bool continuous_update)
: IndexFlatL2(1), continuous_update(continuous_update) {}
/// if not continuous_update, call this between the last add and
/// the first search
void IndexFlat1D::update_permutation() {
perm.resize(ntotal);
if (ntotal < 1000000) {
fvec_argsort(ntotal, get_xb(), (size_t*)perm.data());
} else {
fvec_argsort_parallel(ntotal, get_xb(), (size_t*)perm.data());
}
}
void IndexFlat1D::add(idx_t n, const float* x) {
IndexFlatL2::add(n, x);
if (continuous_update)
update_permutation();
}
void IndexFlat1D::reset() {
IndexFlatL2::reset();
perm.clear();
}
void IndexFlat1D::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(
perm.size() == ntotal, "Call update_permutation before search");
const float* xb = get_xb();
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
float q = x[i]; // query
float* D = distances + i * k;
idx_t* I = labels + i * k;
// binary search
idx_t i0 = 0, i1 = ntotal;
idx_t wp = 0;
if (xb[perm[i0]] > q) {
i1 = 0;
goto finish_right;
}
if (xb[perm[i1 - 1]] <= q) {
i0 = i1 - 1;
goto finish_left;
}
while (i0 + 1 < i1) {
idx_t imed = (i0 + i1) / 2;
if (xb[perm[imed]] <= q)
i0 = imed;
else
i1 = imed;
}
// query is between xb[perm[i0]] and xb[perm[i1]]
// expand to nearest neighs
while (wp < k) {
float xleft = xb[perm[i0]];
float xright = xb[perm[i1]];
if (q - xleft < xright - q) {
D[wp] = q - xleft;
I[wp] = perm[i0];
i0--;
wp++;
if (i0 < 0) {
goto finish_right;
}
} else {
D[wp] = xright - q;
I[wp] = perm[i1];
i1++;
wp++;
if (i1 >= ntotal) {
goto finish_left;
}
}
}
goto done;
finish_right:
// grow to the right from i1
while (wp < k) {
if (i1 < ntotal) {
D[wp] = xb[perm[i1]] - q;
I[wp] = perm[i1];
i1++;
} else {
D[wp] = std::numeric_limits<float>::infinity();
I[wp] = -1;
}
wp++;
}
goto done;
finish_left:
// grow to the left from i0
while (wp < k) {
if (i0 >= 0) {
D[wp] = q - xb[perm[i0]];
I[wp] = perm[i0];
i0--;
} else {
D[wp] = std::numeric_limits<float>::infinity();
I[wp] = -1;
}
wp++;
}
done:;
}
}
} // namespace faiss
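// --- usage sketch (not part of the library sources) -------------------------
// A minimal sketch of IndexFlat1D as implemented above: with
// continuous_update == false the sorted permutation must be rebuilt explicitly
// between the last add() and the first search(). Note that the distances it
// returns are absolute differences (L1), not squared L2. The values below are
// illustrative only.
#include <vector>
#include <faiss/IndexFlat.h>

void index_flat_1d_sketch() {
    faiss::IndexFlat1D index(/*continuous_update=*/false);
    std::vector<float> xb = {0.5f, -1.0f, 3.25f, 2.0f};
    index.add(xb.size(), xb.data());  // 1-D "vectors": one float each, ids 0..3
    index.update_permutation();       // required before searching in this mode

    float q = 1.9f;
    int k = 2;
    std::vector<float> D(k);
    std::vector<faiss::Index::idx_t> I(k);
    index.search(1, &q, k, D.data(), I.data());
    // expected result: id 3 (|1.9 - 2.0| = 0.1) then id 2 (|1.9 - 3.25| = 1.35)
}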
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef INDEX_FLAT_H
#define INDEX_FLAT_H
#include <vector>
#include <faiss/IndexFlatCodes.h>
namespace faiss {
/** Index that stores the full vectors and performs exhaustive search */
struct IndexFlat : IndexFlatCodes {
explicit IndexFlat(idx_t d, MetricType metric = METRIC_L2);
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
void reconstruct(idx_t key, float* recons) const override;
/** compute distance with a subset of vectors
*
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
void compute_distance_subset(
idx_t n,
const float* x,
idx_t k,
float* distances,
const idx_t* labels) const;
// get pointer to the floating point data
float* get_xb() {
return (float*)codes.data();
}
const float* get_xb() const {
return (const float*)codes.data();
}
IndexFlat() {}
DistanceComputer* get_distance_computer() const override;
/* The standalone codec interface (just memcopies in this case) */
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
};
struct IndexFlatIP : IndexFlat {
explicit IndexFlatIP(idx_t d) : IndexFlat(d, METRIC_INNER_PRODUCT) {}
IndexFlatIP() {}
};
struct IndexFlatL2 : IndexFlat {
explicit IndexFlatL2(idx_t d) : IndexFlat(d, METRIC_L2) {}
IndexFlatL2() {}
};
/// optimized version for 1D "vectors".
struct IndexFlat1D : IndexFlatL2 {
bool continuous_update; ///< is the permutation updated continuously?
std::vector<idx_t> perm; ///< sorted database indices
explicit IndexFlat1D(bool continuous_update = true);
/// if not continuous_update, call this between the last add and
/// the first search
void update_permutation();
void add(idx_t n, const float* x) override;
void reset() override;
/// Warn: the distances returned are L1 not L2
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
} // namespace faiss
#endif
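// --- usage sketch (not part of the library sources) -------------------------
// A minimal example of the exhaustive-search API declared above. IndexFlatL2
// needs no training; add() stores the raw floats and search() does brute-force
// k-nearest-neighbor search. Sizes and data are placeholders.
#include <vector>
#include <faiss/IndexFlat.h>

void index_flat_sketch() {
    int d = 64;                              // vector dimension
    faiss::IndexFlatL2 index(d);             // exhaustive squared-L2 search
    std::vector<float> xb(1000 * d, 0.0f);   // database vectors (placeholder data)
    index.add(1000, xb.data());              // stored verbatim via IndexFlatCodes::add

    int k = 4;
    std::vector<float> D(k);                       // output distances, size n * k
    std::vector<faiss::Index::idx_t> I(k);         // output labels, size n * k
    index.search(1, xb.data(), k, D.data(), I.data());
}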
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexFlatCodes.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <cstring>
namespace faiss {
IndexFlatCodes::IndexFlatCodes(size_t code_size, idx_t d, MetricType metric)
: Index(d, metric), code_size(code_size) {}
IndexFlatCodes::IndexFlatCodes() : code_size(0) {}
void IndexFlatCodes::add(idx_t n, const float* x) {
FAISS_THROW_IF_NOT(is_trained);
codes.resize((ntotal + n) * code_size);
sa_encode(n, x, &codes[ntotal * code_size]);
ntotal += n;
}
void IndexFlatCodes::reset() {
codes.clear();
ntotal = 0;
}
size_t IndexFlatCodes::sa_code_size() const {
return code_size;
}
size_t IndexFlatCodes::remove_ids(const IDSelector& sel) {
idx_t j = 0;
for (idx_t i = 0; i < ntotal; i++) {
if (sel.is_member(i)) {
// should be removed
} else {
if (i > j) {
memmove(&codes[code_size * j],
&codes[code_size * i],
code_size);
}
j++;
}
}
size_t nremove = ntotal - j;
if (nremove > 0) {
ntotal = j;
codes.resize(ntotal * code_size);
}
return nremove;
}
void IndexFlatCodes::reconstruct_n(idx_t i0, idx_t ni, float* recons) const {
FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
sa_decode(ni, codes.data() + i0 * code_size, recons);
}
void IndexFlatCodes::reconstruct(idx_t key, float* recons) const {
reconstruct_n(key, 1, recons);
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <faiss/Index.h>
#include <vector>
namespace faiss {
/** Index that encodes all vectors as fixed-size codes (size code_size). Storage
* is in the codes vector */
struct IndexFlatCodes : Index {
size_t code_size;
/// encoded dataset, size ntotal * code_size
std::vector<uint8_t> codes;
IndexFlatCodes();
IndexFlatCodes(size_t code_size, idx_t d, MetricType metric = METRIC_L2);
/// default add uses sa_encode
void add(idx_t n, const float* x) override;
void reset() override;
/// reconstruction using the codec interface
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
void reconstruct(idx_t key, float* recons) const override;
size_t sa_code_size() const override;
/** remove some ids. NB that because of the structure of the
 * index, the semantics of this operation are
 * different from the usual ones: the new ids are shifted */
size_t remove_ids(const IDSelector& sel) override;
};
} // namespace faiss
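// --- usage sketch (not part of the library sources) -------------------------
// A small sketch of the remove_ids() semantics documented above: vectors after
// the removed range are compacted, so their ids are shifted down. It uses
// IDSelectorRange from AuxIndexStructures.h and IndexFlatL2 as a concrete
// IndexFlatCodes subclass; the data is placeholder.
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>

void remove_ids_sketch() {
    int d = 8;
    faiss::IndexFlatL2 index(d);
    std::vector<float> xb(10 * d, 1.0f);
    index.add(10, xb.data());                 // vectors get ids 0..9

    faiss::IDSelectorRange sel(2, 5);         // selects ids 2, 3, 4
    size_t nremoved = index.remove_ids(sel);  // returns 3; remaining ids become 0..6
    // after the call, the vector that had id 5 is addressed as id 2, and so on
    (void)nremoved;
}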
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexHNSW.h>
#include <omp.h>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <queue>
#include <unordered_set>
#include <stdint.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef __SSE__
#endif
#include <faiss/Index2Layer.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
extern "C" {
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
int sgemm_(
const char* transa,
const char* transb,
FINTEGER* m,
FINTEGER* n,
FINTEGER* k,
const float* alpha,
const float* a,
FINTEGER* lda,
const float* b,
FINTEGER* ldb,
float* beta,
float* c,
FINTEGER* ldc);
}
namespace faiss {
using idx_t = Index::idx_t;
using MinimaxHeap = HNSW::MinimaxHeap;
using storage_idx_t = HNSW::storage_idx_t;
using NodeDistFarther = HNSW::NodeDistFarther;
HNSWStats hnsw_stats;
/**************************************************************
* add / search blocks of descriptors
**************************************************************/
namespace {
/* Wrap the distance computer into one that negates the
distances. This makes supporting INNER_PRODUCT search easier */
struct NegativeDistanceComputer : DistanceComputer {
/// owned by this
DistanceComputer* basedis;
explicit NegativeDistanceComputer(DistanceComputer* basedis)
: basedis(basedis) {}
void set_query(const float* x) override {
basedis->set_query(x);
}
/// compute distance of vector i to current query
float operator()(idx_t i) override {
return -(*basedis)(i);
}
/// compute distance between two stored vectors
float symmetric_dis(idx_t i, idx_t j) override {
return -basedis->symmetric_dis(i, j);
}
virtual ~NegativeDistanceComputer() {
delete basedis;
}
};
DistanceComputer* storage_distance_computer(const Index* storage) {
if (storage->metric_type == METRIC_INNER_PRODUCT) {
return new NegativeDistanceComputer(storage->get_distance_computer());
} else {
return storage->get_distance_computer();
}
}
void hnsw_add_vertices(
IndexHNSW& index_hnsw,
size_t n0,
size_t n,
const float* x,
bool verbose,
bool preset_levels = false) {
size_t d = index_hnsw.d;
HNSW& hnsw = index_hnsw.hnsw;
size_t ntotal = n0 + n;
double t0 = getmillisecs();
if (verbose) {
printf("hnsw_add_vertices: adding %zd elements on top of %zd "
"(preset_levels=%d)\n",
n,
n0,
int(preset_levels));
}
if (n == 0) {
return;
}
int max_level = hnsw.prepare_level_tab(n, preset_levels);
if (verbose) {
printf(" max_level = %d\n", max_level);
}
std::vector<omp_lock_t> locks(ntotal);
for (int i = 0; i < ntotal; i++)
omp_init_lock(&locks[i]);
// add vectors from highest to lowest level
std::vector<int> hist;
std::vector<int> order(n);
{ // make buckets with vectors of the same level
// build histogram
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
while (pt_level >= hist.size())
hist.push_back(0);
hist[pt_level]++;
}
// accumulate
std::vector<int> offsets(hist.size() + 1, 0);
for (int i = 0; i < hist.size() - 1; i++) {
offsets[i + 1] = offsets[i] + hist[i];
}
// bucket sort
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = i + n0;
int pt_level = hnsw.levels[pt_id] - 1;
order[offsets[pt_level]++] = pt_id;
}
}
idx_t check_period = InterruptCallback::get_period_hint(
max_level * index_hnsw.d * hnsw.efConstruction);
{ // perform add
RandomGenerator rng2(789);
int i1 = n;
for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
int i0 = i1 - hist[pt_level];
if (verbose) {
printf("Adding %d elements at level %d\n", i1 - i0, pt_level);
}
// random permutation to get rid of dataset order bias
for (int j = i0; j < i1; j++)
std::swap(order[j], order[j + rng2.rand_int(i1 - j)]);
bool interrupt = false;
#pragma omp parallel if (i1 > i0 + 100)
{
VisitedTable vt(ntotal);
DistanceComputer* dis =
storage_distance_computer(index_hnsw.storage);
ScopeDeleter1<DistanceComputer> del(dis);
int prev_display =
verbose && omp_get_thread_num() == 0 ? 0 : -1;
size_t counter = 0;
#pragma omp for schedule(dynamic)
for (int i = i0; i < i1; i++) {
storage_idx_t pt_id = order[i];
dis->set_query(x + (pt_id - n0) * d);
// cannot break
if (interrupt) {
continue;
}
hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
if (prev_display >= 0 && i - i0 > prev_display + 10000) {
prev_display = i - i0;
printf(" %d / %d\r", i - i0, i1 - i0);
fflush(stdout);
}
if (counter % check_period == 0) {
if (InterruptCallback::is_interrupted()) {
interrupt = true;
}
}
counter++;
}
}
if (interrupt) {
FAISS_THROW_MSG("computation interrupted");
}
i1 = i0;
}
FAISS_ASSERT(i1 == 0);
}
if (verbose) {
printf("Done in %.3f ms\n", getmillisecs() - t0);
}
for (int i = 0; i < ntotal; i++) {
omp_destroy_lock(&locks[i]);
}
}
} // namespace
/**************************************************************
* IndexHNSW implementation
**************************************************************/
IndexHNSW::IndexHNSW(int d, int M, MetricType metric)
: Index(d, metric),
hnsw(M),
own_fields(false),
storage(nullptr),
reconstruct_from_neighbors(nullptr) {}
IndexHNSW::IndexHNSW(Index* storage, int M)
: Index(storage->d, storage->metric_type),
hnsw(M),
own_fields(false),
storage(storage),
reconstruct_from_neighbors(nullptr) {}
IndexHNSW::~IndexHNSW() {
if (own_fields) {
delete storage;
}
}
void IndexHNSW::train(idx_t n, const float* x) {
FAISS_THROW_IF_NOT_MSG(
storage,
"Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly");
// hnsw structure does not require training
storage->train(n, x);
is_trained = true;
}
void IndexHNSW::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(
storage,
"Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly");
size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0;
idx_t check_period = InterruptCallback::get_period_hint(
hnsw.max_level * d * hnsw.efSearch);
for (idx_t i0 = 0; i0 < n; i0 += check_period) {
idx_t i1 = std::min(i0 + check_period, n);
#pragma omp parallel
{
VisitedTable vt(ntotal);
DistanceComputer* dis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(dis);
#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder)
for (idx_t i = i0; i < i1; i++) {
idx_t* idxi = labels + i * k;
float* simi = distances + i * k;
dis->set_query(x + i * d);
maxheap_heapify(k, simi, idxi);
HNSWStats stats = hnsw.search(*dis, k, idxi, simi, vt);
n1 += stats.n1;
n2 += stats.n2;
n3 += stats.n3;
ndis += stats.ndis;
nreorder += stats.nreorder;
maxheap_reorder(k, simi, idxi);
if (reconstruct_from_neighbors &&
reconstruct_from_neighbors->k_reorder != 0) {
int k_reorder = reconstruct_from_neighbors->k_reorder;
if (k_reorder == -1 || k_reorder > k)
k_reorder = k;
nreorder += reconstruct_from_neighbors->compute_distances(
k_reorder, idxi, x + i * d, simi);
// sort top k_reorder
maxheap_heapify(
k_reorder, simi, idxi, simi, idxi, k_reorder);
maxheap_reorder(k_reorder, simi, idxi);
}
}
}
InterruptCallback::check();
}
if (metric_type == METRIC_INNER_PRODUCT) {
// we need to revert the negated distances
for (size_t i = 0; i < k * n; i++) {
distances[i] = -distances[i];
}
}
hnsw_stats.combine({n1, n2, n3, ndis, nreorder});
}
void IndexHNSW::add(idx_t n, const float* x) {
FAISS_THROW_IF_NOT_MSG(
storage,
"Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly");
FAISS_THROW_IF_NOT(is_trained);
int n0 = ntotal;
storage->add(n, x);
ntotal = storage->ntotal;
hnsw_add_vertices(*this, n0, n, x, verbose, hnsw.levels.size() == ntotal);
}
void IndexHNSW::reset() {
hnsw.reset();
storage->reset();
ntotal = 0;
}
void IndexHNSW::reconstruct(idx_t key, float* recons) const {
storage->reconstruct(key, recons);
}
void IndexHNSW::shrink_level_0_neighbors(int new_size) {
#pragma omp parallel
{
DistanceComputer* dis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(dis);
#pragma omp for
for (idx_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
std::priority_queue<NodeDistFarther> initial_list;
for (size_t j = begin; j < end; j++) {
int v1 = hnsw.neighbors[j];
if (v1 < 0)
break;
initial_list.emplace(dis->symmetric_dis(i, v1), v1);
// initial_list.emplace(qdis(v1), v1);
}
std::vector<NodeDistFarther> shrunk_list;
HNSW::shrink_neighbor_list(
*dis, initial_list, shrunk_list, new_size);
for (size_t j = begin; j < end; j++) {
if (j - begin < shrunk_list.size())
hnsw.neighbors[j] = shrunk_list[j - begin].id;
else
hnsw.neighbors[j] = -1;
}
}
}
}
void IndexHNSW::search_level_0(
idx_t n,
const float* x,
idx_t k,
const storage_idx_t* nearest,
const float* nearest_d,
float* distances,
idx_t* labels,
int nprobe,
int search_type) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT(nprobe > 0);
storage_idx_t ntotal = hnsw.levels.size();
size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0;
#pragma omp parallel
{
DistanceComputer* qdis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(qdis);
VisitedTable vt(ntotal);
#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder)
for (idx_t i = 0; i < n; i++) {
idx_t* idxi = labels + i * k;
float* simi = distances + i * k;
qdis->set_query(x + i * d);
maxheap_heapify(k, simi, idxi);
if (search_type == 1) {
int nres = 0;
for (int j = 0; j < nprobe; j++) {
storage_idx_t cj = nearest[i * nprobe + j];
if (cj < 0)
break;
if (vt.get(cj))
continue;
int candidates_size = std::max(hnsw.efSearch, int(k));
MinimaxHeap candidates(candidates_size);
candidates.push(cj, nearest_d[i * nprobe + j]);
HNSWStats search_stats;
nres = hnsw.search_from_candidates(
*qdis,
k,
idxi,
simi,
candidates,
vt,
search_stats,
0,
nres);
n1 += search_stats.n1;
n2 += search_stats.n2;
n3 += search_stats.n3;
ndis += search_stats.ndis;
nreorder += search_stats.nreorder;
}
} else if (search_type == 2) {
int candidates_size = std::max(hnsw.efSearch, int(k));
candidates_size = std::max(candidates_size, nprobe);
MinimaxHeap candidates(candidates_size);
for (int j = 0; j < nprobe; j++) {
storage_idx_t cj = nearest[i * nprobe + j];
if (cj < 0)
break;
candidates.push(cj, nearest_d[i * nprobe + j]);
}
HNSWStats search_stats;
hnsw.search_from_candidates(
*qdis, k, idxi, simi, candidates, vt, search_stats, 0);
n1 += search_stats.n1;
n2 += search_stats.n2;
n3 += search_stats.n3;
ndis += search_stats.ndis;
nreorder += search_stats.nreorder;
}
vt.advance();
maxheap_reorder(k, simi, idxi);
}
}
hnsw_stats.combine({n1, n2, n3, ndis, nreorder});
}
void IndexHNSW::init_level_0_from_knngraph(
int k,
const float* D,
const idx_t* I) {
int dest_size = hnsw.nb_neighbors(0);
#pragma omp parallel for
for (idx_t i = 0; i < ntotal; i++) {
DistanceComputer* qdis = storage_distance_computer(storage);
std::vector<float> vec(d);
storage->reconstruct(i, vec.data());
qdis->set_query(vec.data());
std::priority_queue<NodeDistFarther> initial_list;
for (size_t j = 0; j < k; j++) {
int v1 = I[i * k + j];
if (v1 == i)
continue;
if (v1 < 0)
break;
initial_list.emplace(D[i * k + j], v1);
}
std::vector<NodeDistFarther> shrunk_list;
HNSW::shrink_neighbor_list(*qdis, initial_list, shrunk_list, dest_size);
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
if (j - begin < shrunk_list.size())
hnsw.neighbors[j] = shrunk_list[j - begin].id;
else
hnsw.neighbors[j] = -1;
}
}
}
void IndexHNSW::init_level_0_from_entry_points(
int n,
const storage_idx_t* points,
const storage_idx_t* nearests) {
std::vector<omp_lock_t> locks(ntotal);
for (int i = 0; i < ntotal; i++)
omp_init_lock(&locks[i]);
#pragma omp parallel
{
VisitedTable vt(ntotal);
DistanceComputer* dis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(dis);
std::vector<float> vec(storage->d);
#pragma omp for schedule(dynamic)
for (int i = 0; i < n; i++) {
storage_idx_t pt_id = points[i];
storage_idx_t nearest = nearests[i];
storage->reconstruct(pt_id, vec.data());
dis->set_query(vec.data());
hnsw.add_links_starting_from(
*dis, pt_id, nearest, (*dis)(nearest), 0, locks.data(), vt);
if (verbose && i % 10000 == 0) {
printf(" %d / %d\r", i, n);
fflush(stdout);
}
}
}
if (verbose) {
printf("\n");
}
for (int i = 0; i < ntotal; i++)
omp_destroy_lock(&locks[i]);
}
void IndexHNSW::reorder_links() {
int M = hnsw.nb_neighbors(0);
#pragma omp parallel
{
std::vector<float> distances(M);
std::vector<size_t> order(M);
std::vector<storage_idx_t> tmp(M);
DistanceComputer* dis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(dis);
#pragma omp for
for (storage_idx_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
storage_idx_t nj = hnsw.neighbors[j];
if (nj < 0) {
end = j;
break;
}
distances[j - begin] = dis->symmetric_dis(i, nj);
tmp[j - begin] = nj;
}
fvec_argsort(end - begin, distances.data(), order.data());
for (size_t j = begin; j < end; j++) {
hnsw.neighbors[j] = tmp[order[j - begin]];
}
}
}
}
void IndexHNSW::link_singletons() {
printf("search for singletons\n");
std::vector<bool> seen(ntotal);
for (size_t i = 0; i < ntotal; i++) {
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
for (size_t j = begin; j < end; j++) {
storage_idx_t ni = hnsw.neighbors[j];
if (ni >= 0)
seen[ni] = true;
}
}
int n_sing = 0, n_sing_l1 = 0;
std::vector<storage_idx_t> singletons;
for (storage_idx_t i = 0; i < ntotal; i++) {
if (!seen[i]) {
singletons.push_back(i);
n_sing++;
if (hnsw.levels[i] > 1)
n_sing_l1++;
}
}
printf(" Found %d / %" PRId64 " singletons (%d appear in a level above)\n",
n_sing,
ntotal,
n_sing_l1);
std::vector<float> recons(singletons.size() * d);
for (int i = 0; i < singletons.size(); i++) {
FAISS_ASSERT(!"not implemented");
}
}
/**************************************************************
* ReconstructFromNeighbors implementation
**************************************************************/
ReconstructFromNeighbors::ReconstructFromNeighbors(
const IndexHNSW& index,
size_t k,
size_t nsq)
: index(index), k(k), nsq(nsq) {
M = index.hnsw.nb_neighbors(0);
FAISS_ASSERT(k <= 256);
code_size = k == 1 ? 0 : nsq;
ntotal = 0;
d = index.d;
FAISS_ASSERT(d % nsq == 0);
dsub = d / nsq;
k_reorder = -1;
}
void ReconstructFromNeighbors::reconstruct(
storage_idx_t i,
float* x,
float* tmp) const {
const HNSW& hnsw = index.hnsw;
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
if (k == 1 || nsq == 1) {
const float* beta;
if (k == 1) {
beta = codebook.data();
} else {
int idx = codes[i];
beta = codebook.data() + idx * (M + 1);
}
float w0 = beta[0]; // weight of image itself
index.storage->reconstruct(i, tmp);
for (int l = 0; l < d; l++)
x[l] = w0 * tmp[l];
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0)
ji = i;
float w = beta[j - begin + 1];
index.storage->reconstruct(ji, tmp);
for (int l = 0; l < d; l++)
x[l] += w * tmp[l];
}
} else if (nsq == 2) {
int idx0 = codes[2 * i];
int idx1 = codes[2 * i + 1];
const float* beta0 = codebook.data() + idx0 * (M + 1);
const float* beta1 = codebook.data() + (idx1 + k) * (M + 1);
index.storage->reconstruct(i, tmp);
float w0;
w0 = beta0[0];
for (int l = 0; l < dsub; l++)
x[l] = w0 * tmp[l];
w0 = beta1[0];
for (int l = dsub; l < d; l++)
x[l] = w0 * tmp[l];
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0)
ji = i;
index.storage->reconstruct(ji, tmp);
float w;
w = beta0[j - begin + 1];
for (int l = 0; l < dsub; l++)
x[l] += w * tmp[l];
w = beta1[j - begin + 1];
for (int l = dsub; l < d; l++)
x[l] += w * tmp[l];
}
} else {
std::vector<const float*> betas(nsq);
{
const float* b = codebook.data();
const uint8_t* c = &codes[i * code_size];
for (int sq = 0; sq < nsq; sq++) {
betas[sq] = b + (*c++) * (M + 1);
b += (M + 1) * k;
}
}
index.storage->reconstruct(i, tmp);
{
int d0 = 0;
for (int sq = 0; sq < nsq; sq++) {
float w = *(betas[sq]++);
int d1 = d0 + dsub;
for (int l = d0; l < d1; l++) {
x[l] = w * tmp[l];
}
d0 = d1;
}
}
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0)
ji = i;
index.storage->reconstruct(ji, tmp);
int d0 = 0;
for (int sq = 0; sq < nsq; sq++) {
float w = *(betas[sq]++);
int d1 = d0 + dsub;
for (int l = d0; l < d1; l++) {
x[l] += w * tmp[l];
}
d0 = d1;
}
}
}
}
void ReconstructFromNeighbors::reconstruct_n(
storage_idx_t n0,
storage_idx_t ni,
float* x) const {
#pragma omp parallel
{
std::vector<float> tmp(index.d);
#pragma omp for
for (storage_idx_t i = 0; i < ni; i++) {
reconstruct(n0 + i, x + i * index.d, tmp.data());
}
}
}
size_t ReconstructFromNeighbors::compute_distances(
size_t n,
const idx_t* shortlist,
const float* query,
float* distances) const {
std::vector<float> tmp(2 * index.d);
size_t ncomp = 0;
for (int i = 0; i < n; i++) {
if (shortlist[i] < 0)
break;
reconstruct(shortlist[i], tmp.data(), tmp.data() + index.d);
distances[i] = fvec_L2sqr(query, tmp.data(), index.d);
ncomp++;
}
return ncomp;
}
void ReconstructFromNeighbors::get_neighbor_table(storage_idx_t i, float* tmp1)
const {
const HNSW& hnsw = index.hnsw;
size_t begin, end;
hnsw.neighbor_range(i, 0, &begin, &end);
size_t d = index.d;
index.storage->reconstruct(i, tmp1);
for (size_t j = begin; j < end; j++) {
storage_idx_t ji = hnsw.neighbors[j];
if (ji < 0)
ji = i;
index.storage->reconstruct(ji, tmp1 + (j - begin + 1) * d);
}
}
/// called by add_codes
void ReconstructFromNeighbors::estimate_code(
const float* x,
storage_idx_t i,
uint8_t* code) const {
// fill in tmp table with the neighbor values
float* tmp1 = new float[d * (M + 1) + (d * k)];
float* tmp2 = tmp1 + d * (M + 1);
ScopeDeleter<float> del(tmp1);
// collect coordinates of base
get_neighbor_table(i, tmp1);
for (size_t sq = 0; sq < nsq; sq++) {
int d0 = sq * dsub;
{
FINTEGER ki = k, di = d, m1 = M + 1;
FINTEGER dsubi = dsub;
float zero = 0, one = 1;
sgemm_("N",
"N",
&dsubi,
&ki,
&m1,
&one,
tmp1 + d0,
&di,
codebook.data() + sq * (m1 * k),
&m1,
&zero,
tmp2,
&dsubi);
}
float min = HUGE_VAL;
int argmin = -1;
for (size_t j = 0; j < k; j++) {
float dis = fvec_L2sqr(x + d0, tmp2 + j * dsub, dsub);
if (dis < min) {
min = dis;
argmin = j;
}
}
code[sq] = argmin;
}
}
void ReconstructFromNeighbors::add_codes(size_t n, const float* x) {
if (k == 1) { // nothing to encode
ntotal += n;
return;
}
codes.resize(codes.size() + code_size * n);
#pragma omp parallel for
for (int i = 0; i < n; i++) {
estimate_code(
x + i * index.d,
ntotal + i,
codes.data() + (ntotal + i) * code_size);
}
ntotal += n;
FAISS_ASSERT(codes.size() == ntotal * code_size);
}
/**************************************************************
* IndexHNSWFlat implementation
**************************************************************/
IndexHNSWFlat::IndexHNSWFlat() {
is_trained = true;
}
IndexHNSWFlat::IndexHNSWFlat(int d, int M, MetricType metric)
: IndexHNSW(new IndexFlat(d, metric), M) {
own_fields = true;
is_trained = true;
}
/**************************************************************
* IndexHNSWPQ implementation
**************************************************************/
IndexHNSWPQ::IndexHNSWPQ() {}
IndexHNSWPQ::IndexHNSWPQ(int d, int pq_m, int M)
: IndexHNSW(new IndexPQ(d, pq_m, 8), M) {
own_fields = true;
is_trained = false;
}
void IndexHNSWPQ::train(idx_t n, const float* x) {
IndexHNSW::train(n, x);
(dynamic_cast<IndexPQ*>(storage))->pq.compute_sdc_table();
}
/**************************************************************
* IndexHNSWSQ implementation
**************************************************************/
IndexHNSWSQ::IndexHNSWSQ(
int d,
ScalarQuantizer::QuantizerType qtype,
int M,
MetricType metric)
: IndexHNSW(new IndexScalarQuantizer(d, qtype, metric), M) {
is_trained = false;
own_fields = true;
}
IndexHNSWSQ::IndexHNSWSQ() {}
/**************************************************************
* IndexHNSW2Level implementation
**************************************************************/
IndexHNSW2Level::IndexHNSW2Level(
Index* quantizer,
size_t nlist,
int m_pq,
int M)
: IndexHNSW(new Index2Layer(quantizer, nlist, m_pq), M) {
own_fields = true;
is_trained = false;
}
IndexHNSW2Level::IndexHNSW2Level() {}
namespace {
// same as search_from_candidates but uses the visited table as follows:
// visno -> is in result list
// visno + 1 -> in result list + in candidates
int search_from_candidates_2(
const HNSW& hnsw,
DistanceComputer& qdis,
int k,
idx_t* I,
float* D,
MinimaxHeap& candidates,
VisitedTable& vt,
HNSWStats& stats,
int level,
int nres_in = 0) {
int nres = nres_in;
int ndis = 0;
for (int i = 0; i < candidates.size(); i++) {
idx_t v1 = candidates.ids[i];
FAISS_ASSERT(v1 >= 0);
vt.visited[v1] = vt.visno + 1;
}
int nstep = 0;
while (candidates.size() > 0) {
float d0 = 0;
int v0 = candidates.pop_min(&d0);
size_t begin, end;
hnsw.neighbor_range(v0, level, &begin, &end);
for (size_t j = begin; j < end; j++) {
int v1 = hnsw.neighbors[j];
if (v1 < 0)
break;
if (vt.visited[v1] == vt.visno + 1) {
// nothing to do
} else {
ndis++;
float d = qdis(v1);
candidates.push(v1, d);
// never seen before --> add to heap
if (vt.visited[v1] < vt.visno) {
if (nres < k) {
faiss::maxheap_push(++nres, D, I, d, v1);
} else if (d < D[0]) {
faiss::maxheap_replace_top(nres, D, I, d, v1);
}
}
vt.visited[v1] = vt.visno + 1;
}
}
nstep++;
if (nstep > hnsw.efSearch) {
break;
}
}
stats.n1++;
if (candidates.size() == 0)
stats.n2++;
return nres;
}
} // namespace
void IndexHNSW2Level::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
if (dynamic_cast<const Index2Layer*>(storage)) {
IndexHNSW::search(n, x, k, distances, labels);
} else { // "mixed" search
size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0;
const IndexIVFPQ* index_ivfpq =
dynamic_cast<const IndexIVFPQ*>(storage);
int nprobe = index_ivfpq->nprobe;
std::unique_ptr<idx_t[]> coarse_assign(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
index_ivfpq->quantizer->search(
n, x, nprobe, coarse_dis.get(), coarse_assign.get());
index_ivfpq->search_preassigned(
n,
x,
k,
coarse_assign.get(),
coarse_dis.get(),
distances,
labels,
false);
#pragma omp parallel
{
VisitedTable vt(ntotal);
DistanceComputer* dis = storage_distance_computer(storage);
ScopeDeleter1<DistanceComputer> del(dis);
int candidates_size = hnsw.upper_beam;
MinimaxHeap candidates(candidates_size);
#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder)
for (idx_t i = 0; i < n; i++) {
idx_t* idxi = labels + i * k;
float* simi = distances + i * k;
dis->set_query(x + i * d);
// mark all inverted list elements as visited
for (int j = 0; j < nprobe; j++) {
idx_t key = coarse_assign[j + i * nprobe];
if (key < 0)
break;
size_t list_length = index_ivfpq->get_list_size(key);
const idx_t* ids = index_ivfpq->invlists->get_ids(key);
for (int jj = 0; jj < list_length; jj++) {
vt.set(ids[jj]);
}
}
candidates.clear();
// copy the upper_beam elements to candidates list
int search_policy = 2;
if (search_policy == 1) {
for (int j = 0; j < hnsw.upper_beam && j < k; j++) {
if (idxi[j] < 0)
break;
candidates.push(idxi[j], simi[j]);
// search_from_candidates adds them back
idxi[j] = -1;
simi[j] = HUGE_VAL;
}
// reorder from sorted to heap
maxheap_heapify(k, simi, idxi, simi, idxi, k);
HNSWStats search_stats;
hnsw.search_from_candidates(
*dis,
k,
idxi,
simi,
candidates,
vt,
search_stats,
0,
k);
n1 += search_stats.n1;
n2 += search_stats.n2;
n3 += search_stats.n3;
ndis += search_stats.ndis;
nreorder += search_stats.nreorder;
vt.advance();
} else if (search_policy == 2) {
for (int j = 0; j < hnsw.upper_beam && j < k; j++) {
if (idxi[j] < 0)
break;
candidates.push(idxi[j], simi[j]);
}
// reorder from sorted to heap
maxheap_heapify(k, simi, idxi, simi, idxi, k);
HNSWStats search_stats;
search_from_candidates_2(
hnsw,
*dis,
k,
idxi,
simi,
candidates,
vt,
search_stats,
0,
k);
n1 += search_stats.n1;
n2 += search_stats.n2;
n3 += search_stats.n3;
ndis += search_stats.ndis;
nreorder += search_stats.nreorder;
vt.advance();
vt.advance();
}
maxheap_reorder(k, simi, idxi);
}
}
hnsw_stats.combine({n1, n2, n3, ndis, nreorder});
}
}
void IndexHNSW2Level::flip_to_ivf() {
Index2Layer* storage2l = dynamic_cast<Index2Layer*>(storage);
FAISS_THROW_IF_NOT(storage2l);
IndexIVFPQ* index_ivfpq = new IndexIVFPQ(
storage2l->q1.quantizer,
d,
storage2l->q1.nlist,
storage2l->pq.M,
8);
index_ivfpq->pq = storage2l->pq;
index_ivfpq->is_trained = storage2l->is_trained;
index_ivfpq->precompute_table();
index_ivfpq->own_fields = storage2l->q1.own_fields;
storage2l->transfer_to_IVFPQ(*index_ivfpq);
index_ivfpq->make_direct_map(true);
storage = index_ivfpq;
delete storage2l;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/impl/HNSW.h>
#include <faiss/utils/utils.h>
namespace faiss {
struct IndexHNSW;
struct ReconstructFromNeighbors {
typedef Index::idx_t idx_t;
typedef HNSW::storage_idx_t storage_idx_t;
const IndexHNSW& index;
size_t M; // number of neighbors
size_t k; // number of codebook entries
size_t nsq; // number of subvectors
size_t code_size;
int k_reorder; // nb to reorder. -1 = all
std::vector<float> codebook; // size nsq * k * (M + 1)
std::vector<uint8_t> codes; // size ntotal * code_size
size_t ntotal;
size_t d, dsub; // derived values
explicit ReconstructFromNeighbors(
const IndexHNSW& index,
size_t k = 256,
size_t nsq = 1);
/// codes must be added in the correct order and the IndexHNSW
/// must be populated and sorted
void add_codes(size_t n, const float* x);
size_t compute_distances(
size_t n,
const idx_t* shortlist,
const float* query,
float* distances) const;
/// called by add_codes
void estimate_code(const float* x, storage_idx_t i, uint8_t* code) const;
/// called by compute_distances
void reconstruct(storage_idx_t i, float* x, float* tmp) const;
void reconstruct_n(storage_idx_t n0, storage_idx_t ni, float* x) const;
/// get the (M + 1)-by-d table of neighbor coordinates for vector i
void get_neighbor_table(storage_idx_t i, float* out) const;
};
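// Size check for the fields above (illustrative numbers): with d = 128,
// nsq = 4 subvectors and k = 256 codebook entries, dsub = d / nsq = 32,
// code_size = nsq = 4 bytes per stored vector, and the codebook holds
// nsq * k * (M + 1) floats, i.e. 4 * 256 * 33 = 33792 for M = 32 neighbors.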
/** The HNSW index is a normal random-access index with a HNSW
* link structure built on top */
struct IndexHNSW : Index {
typedef HNSW::storage_idx_t storage_idx_t;
// the link structure
HNSW hnsw;
// the sequential storage
bool own_fields;
Index* storage;
ReconstructFromNeighbors* reconstruct_from_neighbors;
explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2);
explicit IndexHNSW(Index* storage, int M = 32);
~IndexHNSW() override;
void add(idx_t n, const float* x) override;
/// Trains the storage if needed
void train(idx_t n, const float* x) override;
/// entry point for search
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void reconstruct(idx_t key, float* recons) const override;
void reset() override;
void shrink_level_0_neighbors(int size);
/** Perform search only on level 0, given the starting points for
* each vertex.
*
* @param search_type 1:perform one search per nprobe, 2: enqueue
* all entry points
*/
void search_level_0(
idx_t n,
const float* x,
idx_t k,
const storage_idx_t* nearest,
const float* nearest_d,
float* distances,
idx_t* labels,
int nprobe = 1,
int search_type = 1) const;
/// alternative graph building
void init_level_0_from_knngraph(int k, const float* D, const idx_t* I);
/// alternative graph building
void init_level_0_from_entry_points(
int npt,
const storage_idx_t* points,
const storage_idx_t* nearests);
// reorder links from nearest to farthest
void reorder_links();
void link_singletons();
};
/** Flat index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWFlat : IndexHNSW {
IndexHNSWFlat();
IndexHNSWFlat(int d, int M, MetricType metric = METRIC_L2);
};
/** PQ index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWPQ : IndexHNSW {
IndexHNSWPQ();
IndexHNSWPQ(int d, int pq_m, int M);
void train(idx_t n, const float* x) override;
};
/** SQ index topped with an HNSW structure to access elements
* more efficiently.
*/
struct IndexHNSWSQ : IndexHNSW {
IndexHNSWSQ();
IndexHNSWSQ(
int d,
ScalarQuantizer::QuantizerType qtype,
int M,
MetricType metric = METRIC_L2);
};
/** 2-level code structure with fast random access
*/
struct IndexHNSW2Level : IndexHNSW {
IndexHNSW2Level();
IndexHNSW2Level(Index* quantizer, size_t nlist, int m_pq, int M);
void flip_to_ivf();
/// entry point for search
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
};
} // namespace faiss
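// --- usage sketch (not part of the library sources) -------------------------
// A minimal sketch of the graph indexes declared above. IndexHNSWFlat owns a
// flat storage index, so no training is required; efConstruction and efSearch
// on the embedded HNSW structure are the usual build-time/query-time
// speed-accuracy knobs. Sizes and parameter values here are illustrative only.
#include <vector>
#include <faiss/IndexHNSW.h>

void index_hnsw_sketch() {
    int d = 128, M = 32;
    faiss::IndexHNSWFlat index(d, M);
    index.hnsw.efConstruction = 80;   // beam width while building the graph
    index.hnsw.efSearch = 64;         // beam width at query time

    std::vector<float> xb(10000 * d, 0.0f);   // placeholder database
    index.add(10000, xb.data());              // builds the link structure level by level

    int k = 10;
    std::vector<float> D(k);
    std::vector<faiss::Index::idx_t> I(k);
    index.search(1, xb.data(), k, D.data(), I.data());
}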
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVF.h>
#include <omp.h>
#include <mutex>
#include <algorithm>
#include <cinttypes>
#include <cstdio>
#include <memory>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
using ScopedIds = InvertedLists::ScopedIds;
using ScopedCodes = InvertedLists::ScopedCodes;
/*****************************************
* Level1Quantizer implementation
******************************************/
Level1Quantizer::Level1Quantizer(Index* quantizer, size_t nlist)
: quantizer(quantizer),
nlist(nlist),
quantizer_trains_alone(0),
own_fields(false),
clustering_index(nullptr) {
// here we set a low # iterations because this is typically used
// for large clusterings (nb this is not used for the MultiIndex,
// for which quantizer_trains_alone = true)
cp.niter = 10;
}
Level1Quantizer::Level1Quantizer()
: quantizer(nullptr),
nlist(0),
quantizer_trains_alone(0),
own_fields(false),
clustering_index(nullptr) {}
Level1Quantizer::~Level1Quantizer() {
if (own_fields)
delete quantizer;
}
void Level1Quantizer::train_q1(
size_t n,
const float* x,
bool verbose,
MetricType metric_type) {
size_t d = quantizer->d;
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (verbose)
printf("IVF quantizer does not need training.\n");
} else if (quantizer_trains_alone == 1) {
if (verbose)
printf("IVF quantizer trains alone...\n");
quantizer->train(n, x);
quantizer->verbose = verbose;
FAISS_THROW_IF_NOT_MSG(
quantizer->ntotal == nlist,
"nlist not consistent with quantizer size");
} else if (quantizer_trains_alone == 0) {
if (verbose)
printf("Training level-1 quantizer on %zd vectors in %zdD\n", n, d);
Clustering clus(d, nlist, cp);
quantizer->reset();
if (clustering_index) {
clus.train(n, x, *clustering_index);
quantizer->add(nlist, clus.centroids.data());
} else {
clus.train(n, x, *quantizer);
}
quantizer->is_trained = true;
} else if (quantizer_trains_alone == 2) {
if (verbose) {
printf("Training L2 quantizer on %zd vectors in %zdD%s\n",
n,
d,
clustering_index ? "(user provided index)" : "");
}
// also accept spherical centroids because in that case
// L2 and IP are equivalent
FAISS_THROW_IF_NOT(
metric_type == METRIC_L2 ||
(metric_type == METRIC_INNER_PRODUCT && cp.spherical));
Clustering clus(d, nlist, cp);
if (!clustering_index) {
IndexFlatL2 assigner(d);
clus.train(n, x, assigner);
} else {
clus.train(n, x, *clustering_index);
}
if (verbose) {
printf("Adding centroids to quantizer\n");
}
if (!quantizer->is_trained) {
if (verbose) {
printf("But training it first on centroids table...\n");
}
quantizer->train(nlist, clus.centroids.data());
}
quantizer->add(nlist, clus.centroids.data());
}
}
size_t Level1Quantizer::coarse_code_size() const {
size_t nl = nlist - 1;
size_t nbyte = 0;
while (nl > 0) {
nbyte++;
nl >>= 8;
}
return nbyte;
}
void Level1Quantizer::encode_listno(Index::idx_t list_no, uint8_t* code) const {
// little endian
size_t nl = nlist - 1;
while (nl > 0) {
*code++ = list_no & 0xff;
list_no >>= 8;
nl >>= 8;
}
}
Index::idx_t Level1Quantizer::decode_listno(const uint8_t* code) const {
size_t nl = nlist - 1;
int64_t list_no = 0;
int nbit = 0;
while (nl > 0) {
list_no |= int64_t(*code++) << nbit;
nbit += 8;
nl >>= 8;
}
FAISS_THROW_IF_NOT(list_no >= 0 && list_no < nlist);
return list_no;
}
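// Worked example of the little-endian list-number coding above (illustrative
// values): with nlist = 5000, nlist - 1 = 4999 = 0x1387 needs two bytes, so
// coarse_code_size() == 2. encode_listno(260) then writes bytes {0x04, 0x01}
// since 260 = 0x0104, and decode_listno() reassembles 0x04 | (0x01 << 8) == 260.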
/*****************************************
* IndexIVF implementation
******************************************/
IndexIVF::IndexIVF(
Index* quantizer,
size_t d,
size_t nlist,
size_t code_size,
MetricType metric)
: Index(d, metric),
Level1Quantizer(quantizer, nlist),
invlists(new ArrayInvertedLists(nlist, code_size)),
own_invlists(true),
code_size(code_size),
nprobe(1),
max_codes(0),
parallel_mode(0) {
FAISS_THROW_IF_NOT(d == quantizer->d);
is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
// Spherical by default if the metric is inner_product
if (metric_type == METRIC_INNER_PRODUCT) {
cp.spherical = true;
}
}
IndexIVF::IndexIVF()
: invlists(nullptr),
own_invlists(false),
code_size(0),
nprobe(1),
max_codes(0),
parallel_mode(0) {}
void IndexIVF::add(idx_t n, const float* x) {
add_with_ids(n, x, nullptr);
}
void IndexIVF::add_with_ids(idx_t n, const float* x, const idx_t* xids) {
std::unique_ptr<idx_t[]> coarse_idx(new idx_t[n]);
quantizer->assign(n, x, coarse_idx.get());
add_core(n, x, xids, coarse_idx.get());
}
void IndexIVF::add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids) {
size_t coarse_size = coarse_code_size();
DirectMapAdd dm_adder(direct_map, n, xids);
for (idx_t i = 0; i < n; i++) {
const uint8_t* code = codes + (code_size + coarse_size) * i;
idx_t list_no = decode_listno(code);
idx_t id = xids ? xids[i] : ntotal + i;
size_t ofs = invlists->add_entry(list_no, id, code + coarse_size);
dm_adder.add(i, list_no, ofs);
}
ntotal += n;
}
void IndexIVF::add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* coarse_idx) {
// do some blocking to avoid excessive allocs
idx_t bs = 65536;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(n, i0 + bs);
if (verbose) {
printf(" IndexIVF::add_with_ids %" PRId64 ":%" PRId64 "\n",
i0,
i1);
}
add_core(
i1 - i0,
x + i0 * d,
xids ? xids + i0 : nullptr,
coarse_idx + i0);
}
return;
}
FAISS_THROW_IF_NOT(coarse_idx);
FAISS_THROW_IF_NOT(is_trained);
direct_map.check_can_add(xids);
size_t nadd = 0, nminus1 = 0;
for (size_t i = 0; i < n; i++) {
if (coarse_idx[i] < 0)
nminus1++;
}
std::unique_ptr<uint8_t[]> flat_codes(new uint8_t[n * code_size]);
encode_vectors(n, x, coarse_idx, flat_codes.get());
DirectMapAdd dm_adder(direct_map, n, xids);
#pragma omp parallel reduction(+ : nadd)
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
idx_t list_no = coarse_idx[i];
if (list_no >= 0 && list_no % nt == rank) {
idx_t id = xids ? xids[i] : ntotal + i;
size_t ofs = invlists->add_entry(
list_no, id, flat_codes.get() + i * code_size);
dm_adder.add(i, list_no, ofs);
nadd++;
} else if (rank == 0 && list_no == -1) {
dm_adder.add(i, -1, 0);
}
}
}
if (verbose) {
printf(" added %zd / %" PRId64 " vectors (%zd -1s)\n",
nadd,
n,
nminus1);
}
ntotal += n;
}
void IndexIVF::make_direct_map(bool b) {
if (b) {
direct_map.set_type(DirectMap::Array, invlists, ntotal);
} else {
direct_map.set_type(DirectMap::NoMap, invlists, ntotal);
}
}
void IndexIVF::set_direct_map_type(DirectMap::Type type) {
direct_map.set_type(type, invlists, ntotal);
}
/** It is a sad fact of software that a conceptually simple function like this
* becomes very complex when you factor in several ways of parallelizing +
* interrupt/error handling + collecting stats + min/max collection. The
* codepath that is used 95% of time is the one for parallel_mode = 0 */
void IndexIVF::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
const size_t nprobe = std::min(nlist, this->nprobe);
FAISS_THROW_IF_NOT(nprobe > 0);
// search function for a subset of queries
auto sub_search_func = [this, k, nprobe](
idx_t n,
const float* x,
float* distances,
idx_t* labels,
IndexIVFStats* ivf_stats) {
std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
double t0 = getmillisecs();
quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
double t1 = getmillisecs();
invlists->prefetch_lists(idx.get(), n * nprobe);
search_preassigned(
n,
x,
k,
idx.get(),
coarse_dis.get(),
distances,
labels,
false,
nullptr,
ivf_stats);
double t2 = getmillisecs();
ivf_stats->quantization_time += t1 - t0;
ivf_stats->search_time += t2 - t0;
};
if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) {
int nt = std::min(omp_get_max_threads(), int(n));
std::vector<IndexIVFStats> stats(nt);
std::mutex exception_mutex;
std::string exception_string;
#pragma omp parallel for if (nt > 1)
for (idx_t slice = 0; slice < nt; slice++) {
IndexIVFStats local_stats;
idx_t i0 = n * slice / nt;
idx_t i1 = n * (slice + 1) / nt;
if (i1 > i0) {
try {
sub_search_func(
i1 - i0,
x + i0 * d,
distances + i0 * k,
labels + i0 * k,
&stats[slice]);
} catch (const std::exception& e) {
std::lock_guard<std::mutex> lock(exception_mutex);
exception_string = e.what();
}
}
}
if (!exception_string.empty()) {
FAISS_THROW_MSG(exception_string.c_str());
}
// collect stats
for (idx_t slice = 0; slice < nt; slice++) {
indexIVF_stats.add(stats[slice]);
}
} else {
// handle parallelization at level below (or don't run in parallel at
// all)
sub_search_func(n, x, distances, labels, &indexIVF_stats);
}
}
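// Usage sketch for the search path above (not part of the library sources): a
// concrete subclass such as IndexIVFFlat supplies encode_vectors() and an
// InvertedListScanner, and callers typically only adjust the query-time knobs,
// e.g.
//
//     faiss::IndexIVFFlat index(&quantizer, d, nlist);  // quantizer: e.g. an IndexFlatL2
//     index.train(n, xb);            // trains the coarse quantizer (Level1Quantizer)
//     index.add(n, xb);              // assigns each vector to an inverted list
//     index.nprobe = 16;             // number of inverted lists visited per query
//     index.search(nq, xq, k, D, I);
//
// parallel_mode selects how the OpenMP work is split (see the comment before
// IndexIVF::search); the default value 0 parallelizes over queries.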
void IndexIVF::search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* keys,
const float* coarse_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params,
IndexIVFStats* ivf_stats) const {
FAISS_THROW_IF_NOT(k > 0);
idx_t nprobe = params ? params->nprobe : this->nprobe;
nprobe = std::min((idx_t)nlist, nprobe);
FAISS_THROW_IF_NOT(nprobe > 0);
idx_t max_codes = params ? params->max_codes : this->max_codes;
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForIP = CMin<float, idx_t>;
using HeapForL2 = CMax<float, idx_t>;
bool interrupt = false;
std::mutex exception_mutex;
std::string exception_string;
int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT);
bool do_parallel = omp_get_max_threads() >= 2 &&
(pmode == 0 ? false
: pmode == 3 ? n > 1
: pmode == 1 ? nprobe > 1
: nprobe * n > 1);
#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap)
{
InvertedListScanner* scanner = get_InvertedListScanner(store_pairs);
ScopeDeleter1<InvertedListScanner> del(scanner);
/*****************************************************
* Depending on parallel_mode, there are two possible ways
* to organize the search. Here we define local functions
* that are in common between the two
******************************************************/
// initialize + reorder a result heap
auto init_result = [&](float* simi, idx_t* idxi) {
if (!do_heap_init)
return;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_heapify<HeapForIP>(k, simi, idxi);
} else {
heap_heapify<HeapForL2>(k, simi, idxi);
}
};
auto add_local_results = [&](const float* local_dis,
const idx_t* local_idx,
float* simi,
idx_t* idxi) {
if (metric_type == METRIC_INNER_PRODUCT) {
heap_addn<HeapForIP>(k, simi, idxi, local_dis, local_idx, k);
} else {
heap_addn<HeapForL2>(k, simi, idxi, local_dis, local_idx, k);
}
};
auto reorder_result = [&](float* simi, idx_t* idxi) {
if (!do_heap_init)
return;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_reorder<HeapForIP>(k, simi, idxi);
} else {
heap_reorder<HeapForL2>(k, simi, idxi);
}
};
// single list scan using the current scanner (with query
// set properly) and storing results in simi and idxi
auto scan_one_list = [&](idx_t key,
float coarse_dis_i,
float* simi,
idx_t* idxi) {
if (key < 0) {
// not enough centroids for multiprobe
return (size_t)0;
}
FAISS_THROW_IF_NOT_FMT(
key < (idx_t)nlist,
"Invalid key=%" PRId64 " nlist=%zd\n",
key,
nlist);
size_t list_size = invlists->list_size(key);
// don't waste time on empty lists
if (list_size == 0) {
return (size_t)0;
}
scanner->set_list(key, coarse_dis_i);
nlistv++;
try {
InvertedLists::ScopedCodes scodes(invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t* ids = nullptr;
if (!store_pairs) {
sids.reset(new InvertedLists::ScopedIds(invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes(
list_size, scodes.get(), ids, simi, idxi, k);
} catch (const std::exception& e) {
std::lock_guard<std::mutex> lock(exception_mutex);
exception_string =
demangle_cpp_symbol(typeid(e).name()) + " " + e.what();
interrupt = true;
return size_t(0);
}
return list_size;
};
/****************************************************
* Actual loops, depending on parallel_mode
****************************************************/
if (pmode == 0 || pmode == 3) {
#pragma omp for
for (idx_t i = 0; i < n; i++) {
if (interrupt) {
continue;
}
// loop over queries
scanner->set_query(x + i * d);
float* simi = distances + i * k;
idx_t* idxi = labels + i * k;
init_result(simi, idxi);
idx_t nscan = 0;
// loop over probes
for (size_t ik = 0; ik < nprobe; ik++) {
nscan += scan_one_list(
keys[i * nprobe + ik],
coarse_dis[i * nprobe + ik],
simi,
idxi);
if (max_codes && nscan >= max_codes) {
break;
}
}
ndis += nscan;
reorder_result(simi, idxi);
if (InterruptCallback::is_interrupted()) {
interrupt = true;
}
} // parallel for
} else if (pmode == 1) {
std::vector<idx_t> local_idx(k);
std::vector<float> local_dis(k);
for (size_t i = 0; i < n; i++) {
scanner->set_query(x + i * d);
init_result(local_dis.data(), local_idx.data());
#pragma omp for schedule(dynamic)
for (idx_t ik = 0; ik < nprobe; ik++) {
ndis += scan_one_list(
keys[i * nprobe + ik],
coarse_dis[i * nprobe + ik],
local_dis.data(),
local_idx.data());
// can't do the test on max_codes
}
// merge thread-local results
float* simi = distances + i * k;
idx_t* idxi = labels + i * k;
#pragma omp single
init_result(simi, idxi);
#pragma omp barrier
#pragma omp critical
{
add_local_results(
local_dis.data(), local_idx.data(), simi, idxi);
}
#pragma omp barrier
#pragma omp single
reorder_result(simi, idxi);
}
} else if (pmode == 2) {
std::vector<idx_t> local_idx(k);
std::vector<float> local_dis(k);
#pragma omp single
for (int64_t i = 0; i < n; i++) {
init_result(distances + i * k, labels + i * k);
}
#pragma omp for schedule(dynamic)
for (int64_t ij = 0; ij < n * nprobe; ij++) {
size_t i = ij / nprobe;
size_t j = ij % nprobe;
scanner->set_query(x + i * d);
init_result(local_dis.data(), local_idx.data());
ndis += scan_one_list(
keys[ij],
coarse_dis[ij],
local_dis.data(),
local_idx.data());
#pragma omp critical
{
add_local_results(
local_dis.data(),
local_idx.data(),
distances + i * k,
labels + i * k);
}
}
#pragma omp single
for (int64_t i = 0; i < n; i++) {
reorder_result(distances + i * k, labels + i * k);
}
} else {
FAISS_THROW_FMT("parallel_mode %d not supported\n", pmode);
}
} // parallel section
if (interrupt) {
if (!exception_string.empty()) {
FAISS_THROW_FMT(
"search interrupted with: %s", exception_string.c_str());
} else {
FAISS_THROW_MSG("computation interrupted");
}
}
if (ivf_stats) {
ivf_stats->nq += n;
ivf_stats->nlist += nlistv;
ivf_stats->ndis += ndis;
ivf_stats->nheap_updates += nheap;
}
}
void IndexIVF::range_search(
idx_t nx,
const float* x,
float radius,
RangeSearchResult* result) const {
const size_t nprobe = std::min(nlist, this->nprobe);
std::unique_ptr<idx_t[]> keys(new idx_t[nx * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[nx * nprobe]);
double t0 = getmillisecs();
quantizer->search(nx, x, nprobe, coarse_dis.get(), keys.get());
indexIVF_stats.quantization_time += getmillisecs() - t0;
t0 = getmillisecs();
invlists->prefetch_lists(keys.get(), nx * nprobe);
range_search_preassigned(
nx,
x,
radius,
keys.get(),
coarse_dis.get(),
result,
false,
nullptr,
&indexIVF_stats);
indexIVF_stats.search_time += getmillisecs() - t0;
}
void IndexIVF::range_search_preassigned(
idx_t nx,
const float* x,
float radius,
const idx_t* keys,
const float* coarse_dis,
RangeSearchResult* result,
bool store_pairs,
const IVFSearchParameters* params,
IndexIVFStats* stats) const {
idx_t nprobe = params ? params->nprobe : this->nprobe;
nprobe = std::min((idx_t)nlist, nprobe);
idx_t max_codes = params ? params->max_codes : this->max_codes;
size_t nlistv = 0, ndis = 0;
bool interrupt = false;
std::mutex exception_mutex;
std::string exception_string;
std::vector<RangeSearchPartialResult*> all_pres(omp_get_max_threads());
int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
// don't start parallel section if single query
bool do_parallel = omp_get_max_threads() >= 2 &&
(pmode == 3 ? false
: pmode == 0 ? nx > 1
: pmode == 1 ? nprobe > 1
: nprobe * nx > 1);
#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis)
{
RangeSearchPartialResult pres(result);
std::unique_ptr<InvertedListScanner> scanner(
get_InvertedListScanner(store_pairs));
FAISS_THROW_IF_NOT(scanner.get());
all_pres[omp_get_thread_num()] = &pres;
// prepare the list scanning function
auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) {
idx_t key = keys[i * nprobe + ik]; /* select the list */
if (key < 0)
return;
FAISS_THROW_IF_NOT_FMT(
key < (idx_t)nlist,
"Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n",
key,
ik,
nlist);
const size_t list_size = invlists->list_size(key);
if (list_size == 0)
return;
try {
InvertedLists::ScopedCodes scodes(invlists, key);
InvertedLists::ScopedIds ids(invlists, key);
scanner->set_list(key, coarse_dis[i * nprobe + ik]);
nlistv++;
ndis += list_size;
scanner->scan_codes_range(
list_size, scodes.get(), ids.get(), radius, qres);
} catch (const std::exception& e) {
std::lock_guard<std::mutex> lock(exception_mutex);
exception_string =
demangle_cpp_symbol(typeid(e).name()) + " " + e.what();
interrupt = true;
}
};
if (parallel_mode == 0) {
#pragma omp for
for (idx_t i = 0; i < nx; i++) {
scanner->set_query(x + i * d);
RangeQueryResult& qres = pres.new_result(i);
for (size_t ik = 0; ik < nprobe; ik++) {
scan_list_func(i, ik, qres);
}
}
} else if (parallel_mode == 1) {
for (size_t i = 0; i < nx; i++) {
scanner->set_query(x + i * d);
RangeQueryResult& qres = pres.new_result(i);
#pragma omp for schedule(dynamic)
for (int64_t ik = 0; ik < nprobe; ik++) {
scan_list_func(i, ik, qres);
}
}
} else if (parallel_mode == 2) {
std::vector<RangeQueryResult*> all_qres(nx);
RangeQueryResult* qres = nullptr;
#pragma omp for schedule(dynamic)
for (idx_t iik = 0; iik < nx * (idx_t)nprobe; iik++) {
idx_t i = iik / (idx_t)nprobe;
idx_t ik = iik % (idx_t)nprobe;
if (qres == nullptr || qres->qno != i) {
FAISS_ASSERT(!qres || i > qres->qno);
qres = &pres.new_result(i);
scanner->set_query(x + i * d);
}
scan_list_func(i, ik, *qres);
}
} else {
FAISS_THROW_FMT("parallel_mode %d not supported\n", parallel_mode);
}
if (parallel_mode == 0) {
pres.finalize();
} else {
#pragma omp barrier
#pragma omp single
RangeSearchPartialResult::merge(all_pres, false);
#pragma omp barrier
}
}
if (interrupt) {
if (!exception_string.empty()) {
FAISS_THROW_FMT(
"search interrupted with: %s", exception_string.c_str());
} else {
FAISS_THROW_MSG("computation interrupted");
}
}
if (stats) {
stats->nq += nx;
stats->nlist += nlistv;
stats->ndis += ndis;
}
}
InvertedListScanner* IndexIVF::get_InvertedListScanner(
bool /*store_pairs*/) const {
return nullptr;
}
void IndexIVF::reconstruct(idx_t key, float* recons) const {
idx_t lo = direct_map.get(key);
reconstruct_from_offset(lo_listno(lo), lo_offset(lo), recons);
}
void IndexIVF::reconstruct_n(idx_t i0, idx_t ni, float* recons) const {
FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t list_size = invlists->list_size(list_no);
ScopedIds idlist(invlists, list_no);
for (idx_t offset = 0; offset < list_size; offset++) {
idx_t id = idlist[offset];
if (!(id >= i0 && id < i0 + ni)) {
continue;
}
float* reconstructed = recons + (id - i0) * d;
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
/* standalone codec interface */
size_t IndexIVF::sa_code_size() const {
size_t coarse_size = coarse_code_size();
return code_size + coarse_size;
}
void IndexIVF::sa_encode(idx_t n, const float* x, uint8_t* bytes) const {
FAISS_THROW_IF_NOT(is_trained);
std::unique_ptr<int64_t[]> idx(new int64_t[n]);
quantizer->assign(n, x, idx.get());
encode_vectors(n, x, idx.get(), bytes, true);
}
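/* Illustrative sketch of the standalone codec path above, using IndexIVFFlat
 * as a concrete subclass; the variable names (d, nlist, nt, xt, n, x,
 * x_decoded) are assumptions of the example, not part of the API:
 *
 *   faiss::IndexFlatL2 coarse(d);
 *   faiss::IndexIVFFlat index(&coarse, d, nlist);
 *   index.train(nt, xt);
 *   std::vector<uint8_t> codes(n * index.sa_code_size());
 *   index.sa_encode(n, x, codes.data());   // coarse id + per-vector code
 *   index.sa_decode(n, codes.data(), x_decoded);
 */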
void IndexIVF::search_and_reconstruct(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
float* recons) const {
FAISS_THROW_IF_NOT(k > 0);
const size_t nprobe = std::min(nlist, this->nprobe);
FAISS_THROW_IF_NOT(nprobe > 0);
idx_t* idx = new idx_t[n * nprobe];
ScopeDeleter<idx_t> del(idx);
float* coarse_dis = new float[n * nprobe];
ScopeDeleter<float> del2(coarse_dis);
quantizer->search(n, x, nprobe, coarse_dis, idx);
invlists->prefetch_lists(idx, n * nprobe);
// search_preassigned() with `store_pairs` enabled to obtain the list_no
// and offset into `codes` for reconstruction
search_preassigned(
n,
x,
k,
idx,
coarse_dis,
distances,
labels,
true /* store_pairs */);
for (idx_t i = 0; i < n; ++i) {
for (idx_t j = 0; j < k; ++j) {
idx_t ij = i * k + j;
idx_t key = labels[ij];
float* reconstructed = recons + ij * d;
if (key < 0) {
// Fill with NaNs
memset(reconstructed, -1, sizeof(*reconstructed) * d);
} else {
int list_no = lo_listno(key);
int offset = lo_offset(key);
// Update label to the actual id
labels[ij] = invlists->get_single_id(list_no, offset);
reconstruct_from_offset(list_no, offset, reconstructed);
}
}
}
}
void IndexIVF::reconstruct_from_offset(
int64_t /*list_no*/,
int64_t /*offset*/,
float* /*recons*/) const {
FAISS_THROW_MSG("reconstruct_from_offset not implemented");
}
void IndexIVF::reset() {
direct_map.clear();
invlists->reset();
ntotal = 0;
}
size_t IndexIVF::remove_ids(const IDSelector& sel) {
size_t nremove = direct_map.remove_ids(sel, invlists);
ntotal -= nremove;
return nremove;
}
void IndexIVF::update_vectors(int n, const idx_t* new_ids, const float* x) {
if (direct_map.type == DirectMap::Hashtable) {
// just remove then add
IDSelectorArray sel(n, new_ids);
size_t nremove = remove_ids(sel);
FAISS_THROW_IF_NOT_MSG(
nremove == n, "did not find all entries to remove");
add_with_ids(n, x, new_ids);
return;
}
FAISS_THROW_IF_NOT(direct_map.type == DirectMap::Array);
// here it is trickier because we don't want to introduce holes
// in the continuous range of ids
FAISS_THROW_IF_NOT(is_trained);
std::vector<idx_t> assign(n);
quantizer->assign(n, x, assign.data());
std::vector<uint8_t> flat_codes(n * code_size);
encode_vectors(n, x, assign.data(), flat_codes.data());
direct_map.update_codes(
invlists, n, new_ids, assign.data(), flat_codes.data());
}
void IndexIVF::train(idx_t n, const float* x) {
if (verbose)
printf("Training level-1 quantizer\n");
train_q1(n, x, verbose, metric_type);
if (verbose)
printf("Training IVF residual\n");
train_residual(n, x);
is_trained = true;
}
void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) {
if (verbose)
printf("IndexIVF: no residual training\n");
// does nothing by default
}
void IndexIVF::check_compatible_for_merge(const IndexIVF& other) const {
// minimal sanity checks
FAISS_THROW_IF_NOT(other.d == d);
FAISS_THROW_IF_NOT(other.nlist == nlist);
FAISS_THROW_IF_NOT(other.code_size == code_size);
FAISS_THROW_IF_NOT_MSG(
typeid(*this) == typeid(other),
"can only merge indexes of the same type");
FAISS_THROW_IF_NOT_MSG(
this->direct_map.no() && other.direct_map.no(),
"merge direct_map not implemented");
}
void IndexIVF::merge_from(IndexIVF& other, idx_t add_id) {
check_compatible_for_merge(other);
invlists->merge_from(other.invlists, add_id);
ntotal += other.ntotal;
other.ntotal = 0;
}
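/* Illustrative use of merge_from: assuming both indexes have the same type,
 * the same nlist/code_size and no direct map, shifting the moved ids by
 * ntotal keeps sequentially assigned ids unique:
 *
 *   index1.merge_from(index2, index1.ntotal);
 *   // index2 is empty afterwards; index1.ntotal now covers both sets
 */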
void IndexIVF::replace_invlists(InvertedLists* il, bool own) {
if (own_invlists) {
delete invlists;
invlists = nullptr;
}
// FAISS_THROW_IF_NOT (ntotal == 0);
if (il) {
FAISS_THROW_IF_NOT(il->nlist == nlist);
FAISS_THROW_IF_NOT(
il->code_size == code_size ||
il->code_size == InvertedLists::INVALID_CODE_SIZE);
}
invlists = il;
own_invlists = own;
}
void IndexIVF::copy_subset_to(
IndexIVF& other,
int subset_type,
idx_t a1,
idx_t a2) const {
FAISS_THROW_IF_NOT(nlist == other.nlist);
FAISS_THROW_IF_NOT(code_size == other.code_size);
FAISS_THROW_IF_NOT(other.direct_map.no());
FAISS_THROW_IF_NOT_FMT(
subset_type == 0 || subset_type == 1 || subset_type == 2,
"subset type %d not implemented",
subset_type);
size_t accu_n = 0;
size_t accu_a1 = 0;
size_t accu_a2 = 0;
InvertedLists* oivf = other.invlists;
for (idx_t list_no = 0; list_no < nlist; list_no++) {
size_t n = invlists->list_size(list_no);
ScopedIds ids_in(invlists, list_no);
if (subset_type == 0) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (a1 <= id && id < a2) {
oivf->add_entry(
list_no,
invlists->get_single_id(list_no, i),
ScopedCodes(invlists, list_no, i).get());
other.ntotal++;
}
}
} else if (subset_type == 1) {
for (idx_t i = 0; i < n; i++) {
idx_t id = ids_in[i];
if (id % a1 == a2) {
oivf->add_entry(
list_no,
invlists->get_single_id(list_no, i),
ScopedCodes(invlists, list_no, i).get());
other.ntotal++;
}
}
} else if (subset_type == 2) {
// see what is allocated to a1 and to a2
size_t next_accu_n = accu_n + n;
size_t next_accu_a1 = next_accu_n * a1 / ntotal;
size_t i1 = next_accu_a1 - accu_a1;
size_t next_accu_a2 = next_accu_n * a2 / ntotal;
size_t i2 = next_accu_a2 - accu_a2;
for (idx_t i = i1; i < i2; i++) {
oivf->add_entry(
list_no,
invlists->get_single_id(list_no, i),
ScopedCodes(invlists, list_no, i).get());
}
other.ntotal += i2 - i1;
accu_a1 = next_accu_a1;
accu_a2 = next_accu_a2;
}
accu_n += n;
}
FAISS_ASSERT(accu_n == ntotal);
}
IndexIVF::~IndexIVF() {
if (own_invlists) {
delete invlists;
}
}
/*************************************************************************
* IndexIVFStats
*************************************************************************/
void IndexIVFStats::reset() {
memset((void*)this, 0, sizeof(*this));
}
void IndexIVFStats::add(const IndexIVFStats& other) {
nq += other.nq;
nlist += other.nlist;
ndis += other.ndis;
nheap_updates += other.nheap_updates;
quantization_time += other.quantization_time;
search_time += other.search_time;
}
IndexIVFStats indexIVF_stats;
/*************************************************************************
* InvertedListScanner
*************************************************************************/
size_t InvertedListScanner::scan_codes(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float* simi,
idx_t* idxi,
size_t k) const {
size_t nup = 0;
if (!keep_max) {
for (size_t j = 0; j < list_size; j++) {
float dis = distance_to_code(codes);
if (dis < simi[0]) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
maxheap_replace_top(k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
} else {
for (size_t j = 0; j < list_size; j++) {
float dis = distance_to_code(codes);
if (dis > simi[0]) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
minheap_replace_top(k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
}
return nup;
}
void InvertedListScanner::scan_codes_range(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& res) const {
for (size_t j = 0; j < list_size; j++) {
float dis = distance_to_code(codes);
bool keep = !keep_max
? dis < radius
: dis > radius; // TODO templatize to remove this test
if (keep) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
res.add(dis, id);
}
codes += code_size;
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_H
#define FAISS_INDEX_IVF_H
#include <stdint.h>
#include <unordered_map>
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/invlists/DirectMap.h>
#include <faiss/invlists/InvertedLists.h>
#include <faiss/utils/Heap.h>
namespace faiss {
/** Encapsulates a quantizer object for the IndexIVF
*
* The class isolates the fields that are independent of the storage
* of the lists (especially training)
*/
struct Level1Quantizer {
Index* quantizer; ///< quantizer that maps vectors to inverted lists
size_t nlist; ///< number of possible key values
/**
* = 0: use the quantizer as index in a kmeans training
* = 1: just pass on the training set to the train() of the quantizer
* = 2: kmeans training on a flat index + add the centroids to the quantizer
*/
char quantizer_trains_alone;
bool own_fields; ///< whether object owns the quantizer (false by default)
ClusteringParameters cp; ///< to override default clustering params
Index* clustering_index; ///< to override index used during clustering
/// Trains the level-1 quantizer; sub-quantizer training happens separately in train_residual
void train_q1(
size_t n,
const float* x,
bool verbose,
MetricType metric_type);
/// compute the number of bytes required to store list ids
size_t coarse_code_size() const;
void encode_listno(Index::idx_t list_no, uint8_t* code) const;
Index::idx_t decode_listno(const uint8_t* code) const;
Level1Quantizer(Index* quantizer, size_t nlist);
Level1Quantizer();
~Level1Quantizer();
};
struct IVFSearchParameters {
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
IVFSearchParameters() : nprobe(1), max_codes(0) {}
virtual ~IVFSearchParameters() {}
};
struct InvertedListScanner;
struct IndexIVFStats;
/** Index based on an inverted file (IVF)
*
* In the inverted file, the quantizer (an Index instance) provides a
* quantization index for each vector to be added. The quantization
* index maps to a list (aka inverted list or posting list), where the
* id of the vector is stored.
*
* The inverted list object is required only after training. If none is
* set externally, an ArrayInvertedLists is used automatically.
*
* At search time, the vector to be searched is also quantized, and
* only the list corresponding to the quantization index is
* searched. This speeds up the search by making it
* non-exhaustive. This can be relaxed using multi-probe search: a few
* (nprobe) quantization indices are selected and several inverted
* lists are visited.
*
* Sub-classes implement a post-filtering of the index that refines
* the distance estimation from the query to database vectors.
*/
struct IndexIVF : Index, Level1Quantizer {
/// Access to the actual data
InvertedLists* invlists;
bool own_invlists;
size_t code_size; ///< code size per vector in bytes
size_t nprobe; ///< number of probes at query time
size_t max_codes; ///< max nb of codes to visit to do a query
/** Parallel mode determines how queries are parallelized with OpenMP
*
* 0 (default): split over queries
* 1: parallelize over inverted lists
* 2: parallelize over both
* 3: split over queries with a finer granularity
*
* PARALLEL_MODE_NO_HEAP_INIT: binary OR with one of the modes above to
* prevent the heaps from being initialized and finalized
*/
int parallel_mode;
const int PARALLEL_MODE_NO_HEAP_INIT = 1024;
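/* Illustrative setting: parallelize over inverted lists and skip the heap
 * init/finalize, e.g. when search_preassigned() is called on result buffers
 * that the caller has already initialized:
 *
 *   index.parallel_mode = 1 | index.PARALLEL_MODE_NO_HEAP_INIT;
 */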
/** optional map that maps back ids to invlist entries. This
* enables reconstruct() */
DirectMap direct_map;
/** The Inverted file takes a quantizer (an Index) on input,
* which implements the function mapping a vector to a list
* identifier.
*/
IndexIVF(
Index* quantizer,
size_t d,
size_t nlist,
size_t code_size,
MetricType metric = METRIC_L2);
void reset() override;
/// Trains the quantizer and calls train_residual to train sub-quantizers
void train(idx_t n, const float* x) override;
/// Calls add_with_ids with NULL ids
void add(idx_t n, const float* x) override;
/// default implementation that calls encode_vectors
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
/** Implementation of vector addition where the vector assignments are
* predefined. The default implementation hands over the code extraction to
* encode_vectors.
*
* @param precomputed_idx quantization indices for the input vectors
* (size n)
*/
virtual void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx);
/** Encodes a set of vectors as they would appear in the inverted lists
*
* @param list_nos inverted list ids as returned by the
* quantizer (size n). -1s are ignored.
* @param codes output codes, size n * code_size
* @param include_listno
* include the list ids in the code (in this case add
* the bytes needed to store a list id, i.e. ceil(log2(nlist) / 8),
* to the code size)
*/
virtual void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listno = false) const = 0;
/** Add vectors that are computed with the standalone codec
*
* @param codes codes to add size n * sa_code_size()
* @param xids corresponding ids, size n
*/
void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids);
/// Sub-classes that encode the residuals can train their encoders here
/// does nothing by default
virtual void train_residual(idx_t n, const float* x);
/** search a set of vectors, that are pre-quantized by the IVF
* quantizer. Fill in the corresponding heaps with the query
* results. The default implementation uses InvertedListScanners
* to do the search.
*
* @param n nb of vectors to query
* @param x query vectors, size n * d
* @param assign coarse quantization indices, size n * nprobe
* @param centroid_dis
* distances to coarse centroids, size n * nprobe
* @param distances
* output distances, size n * k
* @param labels output labels, size n * k
* @param store_pairs store (inv list index, inv list offset) in the
* upper/lower 32 bits of the result instead of ids
* (used for reranking).
* @param params used to override the object's search parameters
* @param stats search stats to be updated (can be null)
*/
virtual void search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* assign,
const float* centroid_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const;
/** assign the vectors, then call search_preassigned() */
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
void range_search_preassigned(
idx_t nx,
const float* x,
float radius,
const idx_t* keys,
const float* coarse_dis,
RangeSearchResult* result,
bool store_pairs = false,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const;
/** Get a scanner for this index (store_pairs means ignore labels)
*
* The default search implementation uses this to compute the distances
*/
virtual InvertedListScanner* get_InvertedListScanner(
bool store_pairs = false) const;
/** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2
*/
void reconstruct(idx_t key, float* recons) const override;
/** Update a subset of vectors.
*
* The index must have a direct_map
*
* @param nv nb of vectors to update
* @param idx vector indices to update, size nv
* @param v vectors of new values, size nv*d
*/
virtual void update_vectors(int nv, const idx_t* idx, const float* v);
/** Reconstruct a subset of the indexed vectors.
*
* Overrides default implementation to bypass reconstruct() which requires
* direct_map to be maintained.
*
* @param i0 first vector to reconstruct
* @param ni nb of vectors to reconstruct
* @param recons output array of reconstructed vectors, size ni * d
*/
void reconstruct_n(idx_t i0, idx_t ni, float* recons) const override;
/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* Overrides default implementation to avoid having to maintain direct_map
* and instead fetch the code offsets through the `store_pairs` flag in
* search_preassigned().
*
* @param recons reconstructed vectors size (n, k, d)
*/
void search_and_reconstruct(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
float* recons) const override;
/** Reconstruct a vector given the location in terms of (inv list index +
* inv list offset) instead of the id.
*
* Useful for reconstructing when the direct_map is not maintained and
* the inv list offset is computed by search_preassigned() with
* `store_pairs` set.
*/
virtual void reconstruct_from_offset(
int64_t list_no,
int64_t offset,
float* recons) const;
/// Dataset manipulation functions
size_t remove_ids(const IDSelector& sel) override;
/** check that the two indexes are compatible (ie, they are
* trained in the same way and have the same
* parameters). Otherwise throw. */
void check_compatible_for_merge(const IndexIVF& other) const;
/** moves the entries from another dataset to self. On output,
* other is empty. add_id is added to all moved ids (for
* sequential ids, this would be this->ntotal) */
virtual void merge_from(IndexIVF& other, idx_t add_id);
/** copy a subset of the entries index to the other index
*
* if subset_type == 0: copies ids in [a1, a2)
* if subset_type == 1: copies ids if id % a1 == a2
* if subset_type == 2: copies inverted lists such that a1
* elements are left before and a2 elements are after
*/
virtual void copy_subset_to(
IndexIVF& other,
int subset_type,
idx_t a1,
idx_t a2) const;
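/* Illustrative use of copy_subset_to: split an index into two halves by id
 * range, assuming other1/other2 are empty indexes built with the same
 * quantizer and parameters and ids were assigned sequentially in [0, ntotal):
 *
 *   index.copy_subset_to(other1, 0, 0, index.ntotal / 2);
 *   index.copy_subset_to(other2, 0, index.ntotal / 2, index.ntotal);
 */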
~IndexIVF() override;
size_t get_list_size(size_t list_no) const {
return invlists->list_size(list_no);
}
/** initialize a direct map
*
* @param new_maintain_direct_map if true, create a direct map,
* else clear it
*/
void make_direct_map(bool new_maintain_direct_map = true);
void set_direct_map_type(DirectMap::Type type);
/// replace the inverted lists, old one is deallocated if own_invlists
void replace_invlists(InvertedLists* il, bool own = false);
/* The standalone codec interface (except sa_decode that is specific) */
size_t sa_code_size() const override;
void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override;
IndexIVF();
};
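/* Minimal usage sketch for a concrete IndexIVF subclass (IndexIVFFlat is
 * declared in IndexIVFFlat.h); nt, xt, nb, xb, nq, xq, k, D and I are
 * assumptions of the example:
 *
 *   faiss::IndexFlatL2 coarse(d);
 *   faiss::IndexIVFFlat index(&coarse, d, nlist);
 *   index.train(nt, xt);            // trains the level-1 quantizer
 *   index.add(nb, xb);              // assigns and stores the vectors
 *   index.nprobe = 8;               // visit 8 inverted lists per query
 *   index.search(nq, xq, k, D, I);  // D: nq*k floats, I: nq*k idx_t
 */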
struct RangeQueryResult;
/** Object that handles a query. The inverted lists to scan are
* provided externally. The object has a lot of state, but
* distance_to_code and scan_codes can be called in multiple
* threads */
struct InvertedListScanner {
using idx_t = Index::idx_t;
idx_t list_no = -1; ///< remember current list
bool keep_max = false; ///< keep maximum instead of minimum
/// store positions in invlists rather than labels
bool store_pairs = false;
/// used in default implementation of scan_codes
size_t code_size = 0;
/// from now on we handle this query.
virtual void set_query(const float* query_vector) = 0;
/// following codes come from this inverted list
virtual void set_list(idx_t list_no, float coarse_dis) = 0;
/// compute a single query-to-code distance
virtual float distance_to_code(const uint8_t* code) const = 0;
/** scan a set of codes, compute distances to current query and
* update heap of results if necessary. Default implementation
* calls distance_to_code.
*
* @param n number of codes to scan
* @param codes codes to scan (n * code_size)
* @param ids corresponding ids (ignored if store_pairs)
* @param distances heap distances (size k)
* @param labels heap labels (size k)
* @param k heap size
* @return number of heap updates performed
*/
virtual size_t scan_codes(
size_t n,
const uint8_t* codes,
const idx_t* ids,
float* distances,
idx_t* labels,
size_t k) const;
/** scan a set of codes, compute distances to current query and
* update results if distances are below radius
*
* (default implementation fails) */
virtual void scan_codes_range(
size_t n,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& result) const;
virtual ~InvertedListScanner() {}
};
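/* Illustrative scanning pattern, mirroring what search_preassigned() does
 * internally; the buffers and the heap initialization of distances/labels
 * (size k) are assumptions of the example:
 *
 *   std::unique_ptr<faiss::InvertedListScanner> scanner(
 *           index.get_InvertedListScanner());
 *   scanner->set_query(x);                  // query vector, size d
 *   scanner->set_list(list_no, coarse_dis); // list selected by the quantizer
 *   scanner->scan_codes(list_size, codes, ids, distances, labels, k);
 */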
struct IndexIVFStats {
size_t nq; // nb of queries run
size_t nlist; // nb of inverted lists scanned
size_t ndis; // nb of distances computed
size_t nheap_updates; // nb of times the heap was updated
double quantization_time; // time spent quantizing vectors (in ms)
double search_time; // time spent searching lists (in ms)
IndexIVFStats() {
reset();
}
void reset();
void add(const IndexIVFStats& other);
};
// global var that collects them all
FAISS_API extern IndexIVFStats indexIVF_stats;
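/* Illustrative use of the global counters, e.g. to check how many distances a
 * batch of queries computed (nq, xq, k, D, I are assumptions of the example):
 *
 *   faiss::indexIVF_stats.reset();
 *   index.search(nq, xq, k, D, I);
 *   printf("ndis=%zu nlist=%zu\n",
 *          faiss::indexIVF_stats.ndis, faiss::indexIVF_stats.nlist);
 */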
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexIVFAdditiveQuantizer.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/ResultHandler.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/extra_distances.h>
#include <faiss/utils/utils.h>
namespace faiss {
/**************************************************************************************
* IndexIVFAdditiveQuantizer
**************************************************************************************/
IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(
AdditiveQuantizer* aq,
Index* quantizer,
size_t d,
size_t nlist,
MetricType metric)
: IndexIVF(quantizer, d, nlist, 0, metric), aq(aq) {
by_residual = true;
}
IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq)
: IndexIVF(), aq(aq) {}
void IndexIVFAdditiveQuantizer::train_residual(idx_t n, const float* x) {
const float* x_in = x;
size_t max_train_points = 1024 * ((size_t)1 << aq->nbits[0]);
x = fvecs_maybe_subsample(
d, (size_t*)&n, max_train_points, x, verbose, 1234);
ScopeDeleter1<float> del_x(x_in == x ? nullptr : x);
if (by_residual) {
std::vector<Index::idx_t> idx(n);
quantizer->assign(n, x, idx.data());
std::vector<float> residuals(n * d);
quantizer->compute_residual_n(n, x, residuals.data(), idx.data());
aq->train(n, residuals.data());
} else {
aq->train(n, x);
}
}
void IndexIVFAdditiveQuantizer::encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos) const {
FAISS_THROW_IF_NOT(is_trained);
// first encode then possibly add listnos
if (by_residual) {
// subtract centroids
std::vector<float> residuals(n * d);
#pragma omp parallel if (n > 10000)
for (idx_t i = 0; i < n; i++) {
quantizer->compute_residual(
x + i * d,
residuals.data() + i * d,
list_nos[i] >= 0 ? list_nos[i] : 0);
}
aq->compute_codes(residuals.data(), codes, n);
} else {
aq->compute_codes(x, codes, n);
}
if (include_listnos) {
// write back from the end, where there is enough space
size_t coarse_size = coarse_code_size();
for (idx_t i = n - 1; i >= 0; i--) {
uint8_t* code = codes + i * (code_size + coarse_size);
memmove(code + coarse_size, codes + i * code_size, code_size);
encode_listno(list_nos[i], code);
}
}
}
IndexIVFAdditiveQuantizer::~IndexIVFAdditiveQuantizer() {}
/*********************************************
* AQInvertedListScanner
*********************************************/
namespace {
using Search_type_t = AdditiveQuantizer::Search_type_t;
struct AQInvertedListScanner : InvertedListScanner {
const IndexIVFAdditiveQuantizer& ia;
const AdditiveQuantizer& aq;
std::vector<float> tmp;
AQInvertedListScanner(const IndexIVFAdditiveQuantizer& ia, bool store_pairs)
: ia(ia), aq(*ia.aq) {
this->store_pairs = store_pairs;
this->code_size = ia.code_size;
keep_max = ia.metric_type == METRIC_INNER_PRODUCT;
tmp.resize(ia.d);
}
const float* q0;
/// from now on we handle this query.
void set_query(const float* query_vector) override {
q0 = query_vector;
}
const float* q;
/// following codes come from this inverted list
void set_list(idx_t list_no, float coarse_dis) override {
if (ia.metric_type == METRIC_L2 && ia.by_residual) {
ia.quantizer->compute_residual(q0, tmp.data(), list_no);
q = tmp.data();
} else {
q = q0;
}
}
~AQInvertedListScanner() {}
};
template <bool is_IP>
struct AQInvertedListScannerDecompress : AQInvertedListScanner {
AQInvertedListScannerDecompress(
const IndexIVFAdditiveQuantizer& ia,
bool store_pairs)
: AQInvertedListScanner(ia, store_pairs) {}
float coarse_dis = 0;
/// following codes come from this inverted list
void set_list(idx_t list_no, float coarse_dis) override {
AQInvertedListScanner::set_list(list_no, coarse_dis);
if (ia.by_residual) {
this->coarse_dis = coarse_dis;
}
}
/// compute a single query-to-code distance
float distance_to_code(const uint8_t* code) const final {
std::vector<float> b(aq.d);
aq.decode(code, b.data(), 1);
FAISS_ASSERT(q);
FAISS_ASSERT(b.data());
return is_IP ? coarse_dis + fvec_inner_product(q, b.data(), aq.d)
: fvec_L2sqr(q, b.data(), aq.d);
}
~AQInvertedListScannerDecompress() override {}
};
template <bool is_IP, Search_type_t search_type>
struct AQInvertedListScannerLUT : AQInvertedListScanner {
std::vector<float> LUT, tmp;
float distance_bias;
AQInvertedListScannerLUT(
const IndexIVFAdditiveQuantizer& ia,
bool store_pairs)
: AQInvertedListScanner(ia, store_pairs) {
LUT.resize(aq.total_codebook_size);
tmp.resize(ia.d);
distance_bias = 0;
}
/// from now on we handle this query.
void set_query(const float* query_vector) override {
AQInvertedListScanner::set_query(query_vector);
if (!is_IP && !ia.by_residual) {
distance_bias = fvec_norm_L2sqr(query_vector, ia.d);
}
}
/// following codes come from this inverted list
void set_list(idx_t list_no, float coarse_dis) override {
AQInvertedListScanner::set_list(list_no, coarse_dis);
// TODO find a way to provide the nprobes together to do a matmul
// + precompute tables
aq.compute_LUT(1, q, LUT.data());
if (ia.by_residual) {
distance_bias = coarse_dis;
}
}
/// compute a single query-to-code distance
float distance_to_code(const uint8_t* code) const final {
return distance_bias +
aq.compute_1_distance_LUT<is_IP, search_type>(code, LUT.data());
}
~AQInvertedListScannerLUT() override {}
};
} // anonymous namespace
InvertedListScanner* IndexIVFAdditiveQuantizer::get_InvertedListScanner(
bool store_pairs) const {
if (metric_type == METRIC_INNER_PRODUCT) {
if (aq->search_type == AdditiveQuantizer::ST_decompress) {
return new AQInvertedListScannerDecompress<true>(
*this, store_pairs);
} else {
return new AQInvertedListScannerLUT<
true,
AdditiveQuantizer::ST_LUT_nonorm>(*this, store_pairs);
}
} else {
switch (aq->search_type) {
case AdditiveQuantizer::ST_decompress:
return new AQInvertedListScannerDecompress<false>(
*this, store_pairs);
#define A(st) \
case AdditiveQuantizer::st: \
return new AQInvertedListScannerLUT<false, AdditiveQuantizer::st>( \
*this, store_pairs);
A(ST_LUT_nonorm)
// A(ST_norm_from_LUT)
A(ST_norm_float)
A(ST_norm_qint8)
A(ST_norm_qint4)
A(ST_norm_cqint8)
A(ST_norm_cqint4)
#undef A
default:
FAISS_THROW_FMT(
"search type %d not supported", aq->search_type);
}
}
}
/**************************************************************************************
* IndexIVFResidualQuantizer
**************************************************************************************/
IndexIVFResidualQuantizer::IndexIVFResidualQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
const std::vector<size_t>& nbits,
MetricType metric,
Search_type_t search_type)
: IndexIVFAdditiveQuantizer(&rq, quantizer, d, nlist, metric),
rq(d, nbits, search_type) {
code_size = invlists->code_size = rq.code_size;
}
IndexIVFResidualQuantizer::IndexIVFResidualQuantizer()
: IndexIVFAdditiveQuantizer(&rq) {}
IndexIVFResidualQuantizer::IndexIVFResidualQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
size_t M, /* number of subquantizers */
size_t nbits, /* number of bit per subvector index */
MetricType metric,
Search_type_t search_type)
: IndexIVFResidualQuantizer(
quantizer,
d,
nlist,
std::vector<size_t>(M, nbits),
metric,
search_type) {}
IndexIVFResidualQuantizer::~IndexIVFResidualQuantizer() {}
/**************************************************************************************
* IndexIVFLocalSearchQuantizer
**************************************************************************************/
IndexIVFLocalSearchQuantizer::IndexIVFLocalSearchQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
size_t M, /* number of subquantizers */
size_t nbits, /* number of bit per subvector index */
MetricType metric,
Search_type_t search_type)
: IndexIVFAdditiveQuantizer(&lsq, quantizer, d, nlist, metric),
lsq(d, M, nbits, search_type) {
code_size = invlists->code_size = lsq.code_size;
}
IndexIVFLocalSearchQuantizer::IndexIVFLocalSearchQuantizer()
: IndexIVFAdditiveQuantizer(&lsq) {}
IndexIVFLocalSearchQuantizer::~IndexIVFLocalSearchQuantizer() {}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef FAISS_INDEX_IVF_ADDITIVE_QUANTIZER_H
#define FAISS_INDEX_IVF_ADDITIVE_QUANTIZER_H
#include <faiss/impl/AdditiveQuantizer.h>
#include <cstdint>
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <faiss/impl/ResidualQuantizer.h>
#include <faiss/impl/platform_macros.h>
namespace faiss {
/// Abstract class for IVF additive quantizers.
/// The search functions are in common.
struct IndexIVFAdditiveQuantizer : IndexIVF {
// the quantizer
AdditiveQuantizer* aq;
bool by_residual = true;
int use_precomputed_table = 0; // for future use
using Search_type_t = AdditiveQuantizer::Search_type_t;
IndexIVFAdditiveQuantizer(
AdditiveQuantizer* aq,
Index* quantizer,
size_t d,
size_t nlist,
MetricType metric = METRIC_L2);
explicit IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq);
void train_residual(idx_t n, const float* x) override;
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
~IndexIVFAdditiveQuantizer() override;
};
/** IndexIVF based on a residual quantizer. Stored vectors are
* approximated by residual quantization codes.
*/
struct IndexIVFResidualQuantizer : IndexIVFAdditiveQuantizer {
/// The residual quantizer used to encode the vectors
ResidualQuantizer rq;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
IndexIVFResidualQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
const std::vector<size_t>& nbits,
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexIVFResidualQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
size_t M, /* number of subquantizers */
size_t nbits, /* number of bit per subvector index */
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexIVFResidualQuantizer();
virtual ~IndexIVFResidualQuantizer();
};
/** IndexIVF based on a local search quantizer (LSQ). Stored vectors are
* approximated by local search quantization codes.
*/
struct IndexIVFLocalSearchQuantizer : IndexIVFAdditiveQuantizer {
/// The LSQ quantizer used to encode the vectors
LocalSearchQuantizer lsq;
/** Constructor.
*
* @param d dimensionality of the input vectors
* @param M number of subquantizers
* @param nbits number of bit per subvector index
*/
IndexIVFLocalSearchQuantizer(
Index* quantizer,
size_t d,
size_t nlist,
size_t M, /* number of subquantizers */
size_t nbits, /* number of bit per subvector index */
MetricType metric = METRIC_L2,
Search_type_t search_type = AdditiveQuantizer::ST_decompress);
IndexIVFLocalSearchQuantizer();
virtual ~IndexIVFLocalSearchQuantizer();
};
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFFlat.h>
#include <omp.h>
#include <cinttypes>
#include <cstdio>
#include <faiss/IndexFlat.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/utils.h>
namespace faiss {
/*****************************************
* IndexIVFFlat implementation
******************************************/
IndexIVFFlat::IndexIVFFlat(
Index* quantizer,
size_t d,
size_t nlist,
MetricType metric)
: IndexIVF(quantizer, d, nlist, sizeof(float) * d, metric) {
code_size = sizeof(float) * d;
}
void IndexIVFFlat::add_core(
idx_t n,
const float* x,
const int64_t* xids,
const int64_t* coarse_idx)
{
FAISS_THROW_IF_NOT(is_trained);
FAISS_THROW_IF_NOT(coarse_idx);
assert(invlists);
direct_map.check_can_add(xids);
int64_t n_add = 0;
DirectMapAdd dm_adder(direct_map, n, xids);
#pragma omp parallel reduction(+ : n_add)
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < n; i++) {
idx_t list_no = coarse_idx[i];
if (list_no >= 0 && list_no % nt == rank) {
idx_t id = xids ? xids[i] : ntotal + i;
const float* xi = x + i * d;
size_t offset =
invlists->add_entry(list_no, id, (const uint8_t*)xi);
dm_adder.add(i, list_no, offset);
n_add++;
} else if (rank == 0 && list_no == -1) {
dm_adder.add(i, -1, 0);
}
}
}
if (verbose) {
printf("IndexIVFFlat::add_core: added %" PRId64 " / %" PRId64
" vectors\n",
n_add,
n);
}
ntotal += n;
}
void IndexIVFFlat::encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos) const {
if (!include_listnos) {
memcpy(codes, x, code_size * n);
} else {
size_t coarse_size = coarse_code_size();
for (size_t i = 0; i < n; i++) {
int64_t list_no = list_nos[i];
uint8_t* code = codes + i * (code_size + coarse_size);
const float* xi = x + i * d;
if (list_no >= 0) {
encode_listno(list_no, code);
memcpy(code + coarse_size, xi, code_size);
} else {
memset(code, 0, code_size + coarse_size);
}
}
}
}
void IndexIVFFlat::sa_decode(idx_t n, const uint8_t* bytes, float* x) const {
size_t coarse_size = coarse_code_size();
for (size_t i = 0; i < n; i++) {
const uint8_t* code = bytes + i * (code_size + coarse_size);
float* xi = x + i * d;
memcpy(xi, code + coarse_size, code_size);
}
}
namespace {
template <MetricType metric, class C>
struct IVFFlatScanner : InvertedListScanner {
size_t d;
IVFFlatScanner(size_t d, bool store_pairs) : d(d) {
this->store_pairs = store_pairs;
}
const float* xi;
void set_query(const float* query) override {
this->xi = query;
}
void set_list(idx_t list_no, float /* coarse_dis */) override {
this->list_no = list_no;
}
float distance_to_code(const uint8_t* code) const override {
const float* yj = (float*)code;
float dis = metric == METRIC_INNER_PRODUCT
? fvec_inner_product(xi, yj, d)
: fvec_L2sqr(xi, yj, d);
return dis;
}
size_t scan_codes(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float* simi,
idx_t* idxi,
size_t k) const override {
const float* list_vecs = (const float*)codes;
size_t nup = 0;
for (size_t j = 0; j < list_size; j++) {
const float* yj = list_vecs + d * j;
float dis = metric == METRIC_INNER_PRODUCT
? fvec_inner_product(xi, yj, d)
: fvec_L2sqr(xi, yj, d);
if (C::cmp(simi[0], dis)) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
heap_replace_top<C>(k, simi, idxi, dis, id);
nup++;
}
}
return nup;
}
void scan_codes_range(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& res) const override {
const float* list_vecs = (const float*)codes;
for (size_t j = 0; j < list_size; j++) {
const float* yj = list_vecs + d * j;
float dis = metric == METRIC_INNER_PRODUCT
? fvec_inner_product(xi, yj, d)
: fvec_L2sqr(xi, yj, d);
if (C::cmp(radius, dis)) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
res.add(dis, id);
}
}
}
};
} // anonymous namespace
InvertedListScanner* IndexIVFFlat::get_InvertedListScanner(
bool store_pairs) const {
if (metric_type == METRIC_INNER_PRODUCT) {
return new IVFFlatScanner<METRIC_INNER_PRODUCT, CMin<float, int64_t>>(
d, store_pairs);
} else if (metric_type == METRIC_L2) {
return new IVFFlatScanner<METRIC_L2, CMax<float, int64_t>>(
d, store_pairs);
} else {
FAISS_THROW_MSG("metric type not supported");
}
return nullptr;
}
void IndexIVFFlat::reconstruct_from_offset(
int64_t list_no,
int64_t offset,
float* recons) const {
memcpy(recons, invlists->get_single_code(list_no, offset), code_size);
}
/*****************************************
* IndexIVFFlatDedup implementation
******************************************/
IndexIVFFlatDedup::IndexIVFFlatDedup(
Index* quantizer,
size_t d,
size_t nlist_,
MetricType metric_type)
: IndexIVFFlat(quantizer, d, nlist_, metric_type) {}
void IndexIVFFlatDedup::train(idx_t n, const float* x) {
std::unordered_map<uint64_t, idx_t> map;
std::unique_ptr<float[]> x2(new float[n * d]);
int64_t n2 = 0;
for (int64_t i = 0; i < n; i++) {
uint64_t hash = hash_bytes((uint8_t*)(x + i * d), code_size);
if (map.count(hash) &&
!memcmp(x2.get() + map[hash] * d, x + i * d, code_size)) {
// is duplicate, skip
} else {
map[hash] = n2;
memcpy(x2.get() + n2 * d, x + i * d, code_size);
n2++;
}
}
if (verbose) {
printf("IndexIVFFlatDedup::train: train on %" PRId64
" points after dedup "
"(was %" PRId64 " points)\n",
n2,
n);
}
IndexIVFFlat::train(n2, x2.get());
}
void IndexIVFFlatDedup::add_with_ids(
idx_t na,
const float* x,
const idx_t* xids) {
FAISS_THROW_IF_NOT(is_trained);
assert(invlists);
FAISS_THROW_IF_NOT_MSG(
direct_map.no(), "IVFFlatDedup not implemented with direct_map");
std::unique_ptr<int64_t[]> idx(new int64_t[na]);
quantizer->assign(na, x, idx.get());
int64_t n_add = 0, n_dup = 0;
#pragma omp parallel reduction(+ : n_add, n_dup)
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();
// each thread takes care of a subset of lists
for (size_t i = 0; i < na; i++) {
int64_t list_no = idx[i];
if (list_no < 0 || list_no % nt != rank) {
continue;
}
idx_t id = xids ? xids[i] : ntotal + i;
const float* xi = x + i * d;
// search if there is already an entry with that id
InvertedLists::ScopedCodes codes(invlists, list_no);
int64_t n = invlists->list_size(list_no);
int64_t offset = -1;
for (int64_t o = 0; o < n; o++) {
if (!memcmp(codes.get() + o * code_size, xi, code_size)) {
offset = o;
break;
}
}
if (offset == -1) { // not found
invlists->add_entry(list_no, id, (const uint8_t*)xi);
} else {
// mark equivalence
idx_t id2 = invlists->get_single_id(list_no, offset);
std::pair<idx_t, idx_t> pair(id2, id);
#pragma omp critical
// executed by one thread at a time
instances.insert(pair);
n_dup++;
}
n_add++;
}
}
if (verbose) {
printf("IndexIVFFlat::add_with_ids: added %" PRId64 " / %" PRId64
" vectors"
" (out of which %" PRId64 " are duplicates)\n",
n_add,
na,
n_dup);
}
ntotal += n_add;
}
void IndexIVFFlatDedup::search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* assign,
const float* centroid_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params,
IndexIVFStats* stats) const {
FAISS_THROW_IF_NOT_MSG(
!store_pairs, "store_pairs not supported in IVFDedup");
IndexIVFFlat::search_preassigned(
n, x, k, assign, centroid_dis, distances, labels, false, params);
std::vector<idx_t> labels2(k);
std::vector<float> dis2(k);
for (int64_t i = 0; i < n; i++) {
idx_t* labels1 = labels + i * k;
float* dis1 = distances + i * k;
int64_t j = 0;
for (; j < k; j++) {
if (instances.find(labels1[j]) != instances.end()) {
// a duplicate: special handling
break;
}
}
if (j < k) {
// there are duplicates, special handling
int64_t j0 = j;
int64_t rp = j;
while (j < k) {
auto range = instances.equal_range(labels1[rp]);
float dis = dis1[rp];
labels2[j] = labels1[rp];
dis2[j] = dis;
j++;
for (auto it = range.first; j < k && it != range.second; ++it) {
labels2[j] = it->second;
dis2[j] = dis;
j++;
}
rp++;
}
memcpy(labels1 + j0,
labels2.data() + j0,
sizeof(labels1[0]) * (k - j0));
memcpy(dis1 + j0, dis2.data() + j0, sizeof(dis2[0]) * (k - j0));
}
}
}
size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel) {
std::unordered_map<idx_t, idx_t> replace;
std::vector<std::pair<idx_t, idx_t>> toadd;
for (auto it = instances.begin(); it != instances.end();) {
if (sel.is_member(it->first)) {
// then we erase this entry
if (!sel.is_member(it->second)) {
// if the second is not erased
if (replace.count(it->first) == 0) {
replace[it->first] = it->second;
} else { // remember we should add an element
std::pair<idx_t, idx_t> new_entry(
replace[it->first], it->second);
toadd.push_back(new_entry);
}
}
it = instances.erase(it);
} else {
if (sel.is_member(it->second)) {
it = instances.erase(it);
} else {
++it;
}
}
}
instances.insert(toadd.begin(), toadd.end());
// mostly copied from IndexIVF.cpp
FAISS_THROW_IF_NOT_MSG(
direct_map.no(), "direct map remove not implemented");
std::vector<int64_t> toremove(nlist);
#pragma omp parallel for
for (int64_t i = 0; i < nlist; i++) {
int64_t l0 = invlists->list_size(i), l = l0, j = 0;
InvertedLists::ScopedIds idsi(invlists, i);
while (j < l) {
if (sel.is_member(idsi[j])) {
if (replace.count(idsi[j]) == 0) {
l--;
invlists->update_entry(
i,
j,
invlists->get_single_id(i, l),
InvertedLists::ScopedCodes(invlists, i, l).get());
} else {
invlists->update_entry(
i,
j,
replace[idsi[j]],
InvertedLists::ScopedCodes(invlists, i, j).get());
j++;
}
} else {
j++;
}
}
toremove[i] = l0 - l;
}
// this will not run well in parallel on ondisk because of possible shrinks
int64_t nremove = 0;
for (int64_t i = 0; i < nlist; i++) {
if (toremove[i] > 0) {
nremove += toremove[i];
invlists->resize(i, invlists->list_size(i) - toremove[i]);
}
}
ntotal -= nremove;
return nremove;
}
void IndexIVFFlatDedup::range_search(
idx_t,
const float*,
float,
RangeSearchResult*) const {
FAISS_THROW_MSG("not implemented");
}
void IndexIVFFlatDedup::update_vectors(int, const idx_t*, const float*) {
FAISS_THROW_MSG("not implemented");
}
void IndexIVFFlatDedup::reconstruct_from_offset(int64_t, int64_t, float*)
const {
FAISS_THROW_MSG("not implemented");
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVF_FLAT_H
#define FAISS_INDEX_IVF_FLAT_H
#include <stdint.h>
#include <unordered_map>
#include <faiss/IndexIVF.h>
namespace faiss {
/** Inverted file with stored vectors. Here the inverted file
* pre-selects the vectors to be searched, but they are not otherwise
* encoded, the code array just contains the raw float entries.
*/
struct IndexIVFFlat : IndexIVF {
IndexIVFFlat(
Index* quantizer,
size_t d,
size_t nlist_,
MetricType = METRIC_L2);
void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx) override;
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
IndexIVFFlat() {}
};
struct IndexIVFFlatDedup : IndexIVFFlat {
/** Maps ids stored in the index to the ids of vectors that are
* the same. When a vector is unique, it does not appear in the
* instances map */
std::unordered_multimap<idx_t, idx_t> instances;
IndexIVFFlatDedup(
Index* quantizer,
size_t d,
size_t nlist_,
MetricType = METRIC_L2);
/// also dedups the training set
void train(idx_t n, const float* x) override;
/// implemented for all IndexIVF* classes
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
void search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* assign,
const float* centroid_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const override;
size_t remove_ids(const IDSelector& sel) override;
/// not implemented
void range_search(
idx_t n,
const float* x,
float radius,
RangeSearchResult* result) const override;
/// not implemented
void update_vectors(int nv, const idx_t* idx, const float* v) override;
/// not implemented
void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
const override;
IndexIVFFlatDedup() {}
};
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFPQ.h>
#include <stdint.h>
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <algorithm>
#include <faiss/utils/Heap.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/utils.h>
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <faiss/utils/hamming.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/AuxIndexStructures.h>
namespace faiss {
/*****************************************
* IndexIVFPQ implementation
******************************************/
IndexIVFPQ::IndexIVFPQ(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
MetricType metric)
: IndexIVF(quantizer, d, nlist, 0, metric), pq(d, M, nbits_per_idx) {
FAISS_THROW_IF_NOT(nbits_per_idx <= 8);
code_size = pq.code_size;
invlists->code_size = code_size;
is_trained = false;
by_residual = true;
use_precomputed_table = 0;
scan_table_threshold = 0;
polysemous_training = nullptr;
do_polysemous_training = false;
polysemous_ht = 0;
}
/****************************************************************
* training */
void IndexIVFPQ::train_residual(idx_t n, const float* x) {
train_residual_o(n, x, nullptr);
}
void IndexIVFPQ::train_residual_o(idx_t n, const float* x, float* residuals_2) {
const float* x_in = x;
x = fvecs_maybe_subsample(
d,
(size_t*)&n,
pq.cp.max_points_per_centroid * pq.ksub,
x,
verbose,
pq.cp.seed);
ScopeDeleter<float> del_x(x_in == x ? nullptr : x);
const float* trainset;
ScopeDeleter<float> del_residuals;
if (by_residual) {
if (verbose)
printf("computing residuals\n");
idx_t* assign = new idx_t[n]; // assignment to coarse centroids
ScopeDeleter<idx_t> del(assign);
quantizer->assign(n, x, assign);
float* residuals = new float[n * d];
del_residuals.set(residuals);
for (idx_t i = 0; i < n; i++)
quantizer->compute_residual(
x + i * d, residuals + i * d, assign[i]);
trainset = residuals;
} else {
trainset = x;
}
if (verbose)
printf("training %zdx%zd product quantizer on %" PRId64
" vectors in %dD\n",
pq.M,
pq.ksub,
n,
d);
pq.verbose = verbose;
pq.train(n, trainset);
if (do_polysemous_training) {
if (verbose)
printf("doing polysemous training for PQ\n");
PolysemousTraining default_pt;
PolysemousTraining* pt = polysemous_training;
if (!pt)
pt = &default_pt;
pt->optimize_pq_for_hamming(pq, n, trainset);
}
// prepare second-level residuals for refine PQ
if (residuals_2) {
uint8_t* train_codes = new uint8_t[pq.code_size * n];
ScopeDeleter<uint8_t> del(train_codes);
pq.compute_codes(trainset, train_codes, n);
for (idx_t i = 0; i < n; i++) {
const float* xx = trainset + i * d;
float* res = residuals_2 + i * d;
pq.decode(train_codes + i * pq.code_size, res);
for (int j = 0; j < d; j++)
res[j] = xx[j] - res[j];
}
}
if (by_residual) {
precompute_table();
}
}
/****************************************************************
* IVFPQ as codec */
/* produce a binary signature based on the residual vector */
void IndexIVFPQ::encode(idx_t key, const float* x, uint8_t* code) const {
if (by_residual) {
std::vector<float> residual_vec(d);
quantizer->compute_residual(x, residual_vec.data(), key);
pq.compute_code(residual_vec.data(), code);
} else
pq.compute_code(x, code);
}
void IndexIVFPQ::encode_multiple(
size_t n,
idx_t* keys,
const float* x,
uint8_t* xcodes,
bool compute_keys) const {
if (compute_keys)
quantizer->assign(n, x, keys);
encode_vectors(n, x, keys, xcodes);
}
void IndexIVFPQ::decode_multiple(
size_t n,
const idx_t* keys,
const uint8_t* xcodes,
float* x) const {
pq.decode(xcodes, x, n);
if (by_residual) {
std::vector<float> centroid(d);
for (size_t i = 0; i < n; i++) {
quantizer->reconstruct(keys[i], centroid.data());
float* xi = x + i * d;
for (size_t j = 0; j < d; j++) {
xi[j] += centroid[j];
}
}
}
}
/****************************************************************
* add */
void IndexIVFPQ::add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* coarse_idx) {
add_core_o(n, x, xids, nullptr, coarse_idx);
}
static float* compute_residuals(
const Index* quantizer,
Index::idx_t n,
const float* x,
const Index::idx_t* list_nos) {
size_t d = quantizer->d;
float* residuals = new float[n * d];
// TODO: parallelize?
for (size_t i = 0; i < n; i++) {
if (list_nos[i] < 0)
memset(residuals + i * d, 0, sizeof(*residuals) * d);
else
quantizer->compute_residual(
x + i * d, residuals + i * d, list_nos[i]);
}
return residuals;
}
void IndexIVFPQ::encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos) const {
if (by_residual) {
float* to_encode = compute_residuals(quantizer, n, x, list_nos);
ScopeDeleter<float> del(to_encode);
pq.compute_codes(to_encode, codes, n);
} else {
pq.compute_codes(x, codes, n);
}
if (include_listnos) {
size_t coarse_size = coarse_code_size();
for (idx_t i = n - 1; i >= 0; i--) {
uint8_t* code = codes + i * (coarse_size + code_size);
memmove(code + coarse_size, codes + i * code_size, code_size);
encode_listno(list_nos[i], code);
}
}
}
void IndexIVFPQ::sa_decode(idx_t n, const uint8_t* codes, float* x) const {
size_t coarse_size = coarse_code_size();
#pragma omp parallel
{
std::vector<float> residual(d);
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const uint8_t* code = codes + i * (code_size + coarse_size);
int64_t list_no = decode_listno(code);
float* xi = x + i * d;
pq.decode(code + coarse_size, xi);
if (by_residual) {
quantizer->reconstruct(list_no, residual.data());
for (size_t j = 0; j < d; j++) {
xi[j] += residual[j];
}
}
}
}
}
void IndexIVFPQ::add_core_o(
idx_t n,
const float* x,
const idx_t* xids,
float* residuals_2,
const idx_t* precomputed_idx) {
idx_t bs = 32768;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(i0 + bs, n);
if (verbose) {
printf("IndexIVFPQ::add_core_o: adding %" PRId64 ":%" PRId64
" / %" PRId64 "\n",
i0,
i1,
n);
}
add_core_o(
i1 - i0,
x + i0 * d,
xids ? xids + i0 : nullptr,
residuals_2 ? residuals_2 + i0 * d : nullptr,
precomputed_idx ? precomputed_idx + i0 : nullptr);
}
return;
}
InterruptCallback::check();
direct_map.check_can_add(xids);
FAISS_THROW_IF_NOT(is_trained);
double t0 = getmillisecs();
const idx_t* idx;
ScopeDeleter<idx_t> del_idx;
if (precomputed_idx) {
idx = precomputed_idx;
} else {
idx_t* idx0 = new idx_t[n];
del_idx.set(idx0);
quantizer->assign(n, x, idx0);
idx = idx0;
}
double t1 = getmillisecs();
uint8_t* xcodes = new uint8_t[n * code_size];
ScopeDeleter<uint8_t> del_xcodes(xcodes);
const float* to_encode = nullptr;
ScopeDeleter<float> del_to_encode;
if (by_residual) {
to_encode = compute_residuals(quantizer, n, x, idx);
del_to_encode.set(to_encode);
} else {
to_encode = x;
}
pq.compute_codes(to_encode, xcodes, n);
double t2 = getmillisecs();
// TODO: parallelize?
size_t n_ignore = 0;
for (size_t i = 0; i < n; i++) {
idx_t key = idx[i];
idx_t id = xids ? xids[i] : ntotal + i;
if (key < 0) {
direct_map.add_single_id(id, -1, 0);
n_ignore++;
if (residuals_2)
memset(residuals_2, 0, sizeof(*residuals_2) * d);
continue;
}
uint8_t* code = xcodes + i * code_size;
size_t offset = invlists->add_entry(key, id, code);
if (residuals_2) {
float* res2 = residuals_2 + i * d;
const float* xi = to_encode + i * d;
pq.decode(code, res2);
for (int j = 0; j < d; j++)
res2[j] = xi[j] - res2[j];
}
direct_map.add_single_id(id, key, offset);
}
double t3 = getmillisecs();
if (verbose) {
char comment[100] = {0};
if (n_ignore > 0)
snprintf(comment, 100, "(%zd vectors ignored)", n_ignore);
printf(" add_core times: %.3f %.3f %.3f %s\n",
t1 - t0,
t2 - t1,
t3 - t2,
comment);
}
ntotal += n;
}
void IndexIVFPQ::reconstruct_from_offset(
int64_t list_no,
int64_t offset,
float* recons) const {
const uint8_t* code = invlists->get_single_code(list_no, offset);
if (by_residual) {
std::vector<float> centroid(d);
quantizer->reconstruct(list_no, centroid.data());
pq.decode(code, recons);
for (int i = 0; i < d; ++i) {
recons[i] += centroid[i];
}
} else {
pq.decode(code, recons);
}
}
/// 2G by default, accommodates tables up to PQ32 w/ 65536 centroids
size_t precomputed_table_max_bytes = ((size_t)1) << 31;
/** Precomputed tables for residuals
*
* During IVFPQ search with by_residual, we compute
*
* d = || x - y_C - y_R ||^2
*
* where x is the query vector, y_C the coarse centroid, y_R the
* refined PQ centroid. The expression can be decomposed as:
*
* d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
* --------------- --------------------------- -------
* term 1 term 2 term 3
*
* When using multiprobe, we use the following decomposition:
* - term 1 is the distance to the coarse centroid, that is computed
* during the 1st stage search.
* - term 2 can be precomputed, as it does not involve x. However,
* because of the PQ, it needs nlist * M * ksub storage. This is why
* use_precomputed_table is off by default
* - term 3 is the classical non-residual distance table.
*
* Since y_R is defined by a product quantizer, it is split across
* subvectors and stored separately for each subvector. If the coarse
* quantizer is a MultiIndexQuantizer then the table can be stored
* more compactly.
*
* At search time, the tables for term 2 and term 3 are added up. This
* is faster when the length of the lists is > ksub * M.
*/
void initialize_IVFPQ_precomputed_table(
int& use_precomputed_table,
const Index* quantizer,
const ProductQuantizer& pq,
AlignedTable<float>& precomputed_table,
bool verbose) {
size_t nlist = quantizer->ntotal;
size_t d = quantizer->d;
FAISS_THROW_IF_NOT(d == pq.d);
if (use_precomputed_table == -1) {
precomputed_table.resize(0);
return;
}
if (use_precomputed_table == 0) { // then choose the type of table
if (quantizer->metric_type == METRIC_INNER_PRODUCT) {
if (verbose) {
printf("IndexIVFPQ::precompute_table: precomputed "
"tables not needed for inner product quantizers\n");
}
precomputed_table.resize(0);
return;
}
const MultiIndexQuantizer* miq =
dynamic_cast<const MultiIndexQuantizer*>(quantizer);
if (miq && pq.M % miq->pq.M == 0)
use_precomputed_table = 2;
else {
size_t table_size = pq.M * pq.ksub * nlist * sizeof(float);
if (table_size > precomputed_table_max_bytes) {
if (verbose) {
printf("IndexIVFPQ::precompute_table: not precomputing table, "
"it would be too big: %zd bytes (max %zd)\n",
table_size,
precomputed_table_max_bytes);
use_precomputed_table = 0;
}
return;
}
use_precomputed_table = 1;
}
} // otherwise assume user has set appropriate flag on input
if (verbose) {
printf("precomputing IVFPQ tables type %d\n", use_precomputed_table);
}
// squared norms of the PQ centroids
std::vector<float> r_norms(pq.M * pq.ksub, NAN);
for (int m = 0; m < pq.M; m++)
for (int j = 0; j < pq.ksub; j++)
r_norms[m * pq.ksub + j] =
fvec_norm_L2sqr(pq.get_centroids(m, j), pq.dsub);
if (use_precomputed_table == 1) {
precomputed_table.resize(nlist * pq.M * pq.ksub);
std::vector<float> centroid(d);
for (size_t i = 0; i < nlist; i++) {
quantizer->reconstruct(i, centroid.data());
float* tab = &precomputed_table[i * pq.M * pq.ksub];
pq.compute_inner_prod_table(centroid.data(), tab);
fvec_madd(pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab);
}
} else if (use_precomputed_table == 2) {
const MultiIndexQuantizer* miq =
dynamic_cast<const MultiIndexQuantizer*>(quantizer);
FAISS_THROW_IF_NOT(miq);
const ProductQuantizer& cpq = miq->pq;
FAISS_THROW_IF_NOT(pq.M % cpq.M == 0);
precomputed_table.resize(cpq.ksub * pq.M * pq.ksub);
// reorder PQ centroid table
std::vector<float> centroids(d * cpq.ksub, NAN);
for (int m = 0; m < cpq.M; m++) {
for (size_t i = 0; i < cpq.ksub; i++) {
memcpy(centroids.data() + i * d + m * cpq.dsub,
cpq.get_centroids(m, i),
sizeof(*centroids.data()) * cpq.dsub);
}
}
pq.compute_inner_prod_tables(
cpq.ksub, centroids.data(), precomputed_table.data());
for (size_t i = 0; i < cpq.ksub; i++) {
float* tab = &precomputed_table[i * pq.M * pq.ksub];
fvec_madd(pq.M * pq.ksub, r_norms.data(), 2.0, tab, tab);
}
}
}
void IndexIVFPQ::precompute_table() {
initialize_IVFPQ_precomputed_table(
use_precomputed_table, quantizer, pq, precomputed_table, verbose);
}
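// A minimal usage sketch (not part of the library; names and sizes are
// illustrative): forcing the generic precomputed tables before adding and
// searching. precompute_table() fills precomputed_table with
// || y_R ||^2 + 2 * (y_C|y_R) (term 2 above) for every list and PQ centroid.
//
//   faiss::IndexFlatL2 coarse(d);
//   faiss::IndexIVFPQ index(&coarse, d, nlist, /*M*/ 8, /*nbits*/ 8);
//   index.train(nt, xt);
//   index.use_precomputed_table = 1; // request the generic table layout
//   index.precompute_table();        // resizes and fills precomputed_table
//   index.add(nb, xb);
//   index.nprobe = 16;
//   index.search(nq, xq, k, D, I);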
namespace {
using idx_t = Index::idx_t;
#define TIC t0 = get_cycles()
#define TOC get_cycles() - t0
/** QueryTables manages the various ways of searching an
* IndexIVFPQ. The code contains a lot of branches, depending on:
* - metric_type: are we computing L2 or Inner product similarity?
* - by_residual: do we encode raw vectors or residuals?
* - use_precomputed_table: are x_R|x_C tables precomputed?
* - polysemous_ht: are we filtering with polysemous codes?
*/
struct QueryTables {
/*****************************************************
* General data from the IVFPQ
*****************************************************/
const IndexIVFPQ& ivfpq;
const IVFSearchParameters* params;
// copied from IndexIVFPQ for easier access
int d;
const ProductQuantizer& pq;
MetricType metric_type;
bool by_residual;
int use_precomputed_table;
int polysemous_ht;
// pre-allocated data buffers
float *sim_table, *sim_table_2;
float *residual_vec, *decoded_vec;
// single data buffer
std::vector<float> mem;
// for table pointers
std::vector<const float*> sim_table_ptrs;
explicit QueryTables(
const IndexIVFPQ& ivfpq,
const IVFSearchParameters* params)
: ivfpq(ivfpq),
d(ivfpq.d),
pq(ivfpq.pq),
metric_type(ivfpq.metric_type),
by_residual(ivfpq.by_residual),
use_precomputed_table(ivfpq.use_precomputed_table) {
mem.resize(pq.ksub * pq.M * 2 + d * 2);
sim_table = mem.data();
sim_table_2 = sim_table + pq.ksub * pq.M;
residual_vec = sim_table_2 + pq.ksub * pq.M;
decoded_vec = residual_vec + d;
// for polysemous
polysemous_ht = ivfpq.polysemous_ht;
if (auto ivfpq_params =
dynamic_cast<const IVFPQSearchParameters*>(params)) {
polysemous_ht = ivfpq_params->polysemous_ht;
}
if (polysemous_ht != 0) {
q_code.resize(pq.code_size);
}
init_list_cycles = 0;
sim_table_ptrs.resize(pq.M);
}
/*****************************************************
* What we do when query is known
*****************************************************/
// field specific to query
const float* qi;
// query-specific initialization
void init_query(const float* qi) {
this->qi = qi;
if (metric_type == METRIC_INNER_PRODUCT)
init_query_IP();
else
init_query_L2();
if (!by_residual && polysemous_ht != 0)
pq.compute_code(qi, q_code.data());
}
void init_query_IP() {
// precompute some tables specific to the query qi
pq.compute_inner_prod_table(qi, sim_table);
}
void init_query_L2() {
if (!by_residual) {
pq.compute_distance_table(qi, sim_table);
} else if (use_precomputed_table) {
pq.compute_inner_prod_table(qi, sim_table_2);
}
}
/*****************************************************
* When inverted list is known: prepare computations
*****************************************************/
// fields specific to list
Index::idx_t key;
float coarse_dis;
std::vector<uint8_t> q_code;
uint64_t init_list_cycles;
/// once we know the query and the centroid, we can prepare the
/// sim_table that will be used for accumulation
/// and dis0, the initial value
float precompute_list_tables() {
float dis0 = 0;
uint64_t t0;
TIC;
if (by_residual) {
if (metric_type == METRIC_INNER_PRODUCT)
dis0 = precompute_list_tables_IP();
else
dis0 = precompute_list_tables_L2();
}
init_list_cycles += TOC;
return dis0;
}
float precompute_list_table_pointers() {
float dis0 = 0;
uint64_t t0;
TIC;
if (by_residual) {
if (metric_type == METRIC_INNER_PRODUCT)
FAISS_THROW_MSG("not implemented");
else
dis0 = precompute_list_table_pointers_L2();
}
init_list_cycles += TOC;
return dis0;
}
/*****************************************************
* compute tables for inner prod
*****************************************************/
float precompute_list_tables_IP() {
// prepare the sim_table that will be used for accumulation
// and dis0, the initial value
ivfpq.quantizer->reconstruct(key, decoded_vec);
// decoded_vec = centroid
float dis0 = fvec_inner_product(qi, decoded_vec, d);
if (polysemous_ht) {
for (int i = 0; i < d; i++) {
residual_vec[i] = qi[i] - decoded_vec[i];
}
pq.compute_code(residual_vec, q_code.data());
}
return dis0;
}
/*****************************************************
* compute tables for L2 distance
*****************************************************/
float precompute_list_tables_L2() {
float dis0 = 0;
if (use_precomputed_table == 0 || use_precomputed_table == -1) {
ivfpq.quantizer->compute_residual(qi, residual_vec, key);
pq.compute_distance_table(residual_vec, sim_table);
if (polysemous_ht != 0) {
pq.compute_code(residual_vec, q_code.data());
}
} else if (use_precomputed_table == 1) {
dis0 = coarse_dis;
fvec_madd(
pq.M * pq.ksub,
ivfpq.precomputed_table.data() + key * pq.ksub * pq.M,
-2.0,
sim_table_2,
sim_table);
if (polysemous_ht != 0) {
ivfpq.quantizer->compute_residual(qi, residual_vec, key);
pq.compute_code(residual_vec, q_code.data());
}
} else if (use_precomputed_table == 2) {
dis0 = coarse_dis;
const MultiIndexQuantizer* miq =
dynamic_cast<const MultiIndexQuantizer*>(ivfpq.quantizer);
FAISS_THROW_IF_NOT(miq);
const ProductQuantizer& cpq = miq->pq;
int Mf = pq.M / cpq.M;
const float* qtab = sim_table_2; // query-specific table
float* ltab = sim_table; // (output) list-specific table
long k = key;
for (int cm = 0; cm < cpq.M; cm++) {
// compute PQ index
int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
k >>= cpq.nbits;
// get corresponding table
const float* pc = ivfpq.precomputed_table.data() +
(ki * pq.M + cm * Mf) * pq.ksub;
if (polysemous_ht == 0) {
// sum up with query-specific table
fvec_madd(Mf * pq.ksub, pc, -2.0, qtab, ltab);
ltab += Mf * pq.ksub;
qtab += Mf * pq.ksub;
} else {
for (int m = cm * Mf; m < (cm + 1) * Mf; m++) {
q_code[m] = fvec_madd_and_argmin(
pq.ksub, pc, -2, qtab, ltab);
pc += pq.ksub;
ltab += pq.ksub;
qtab += pq.ksub;
}
}
}
}
return dis0;
}
float precompute_list_table_pointers_L2() {
float dis0 = 0;
if (use_precomputed_table == 1) {
dis0 = coarse_dis;
const float* s =
ivfpq.precomputed_table.data() + key * pq.ksub * pq.M;
for (int m = 0; m < pq.M; m++) {
sim_table_ptrs[m] = s;
s += pq.ksub;
}
} else if (use_precomputed_table == 2) {
dis0 = coarse_dis;
const MultiIndexQuantizer* miq =
dynamic_cast<const MultiIndexQuantizer*>(ivfpq.quantizer);
FAISS_THROW_IF_NOT(miq);
const ProductQuantizer& cpq = miq->pq;
int Mf = pq.M / cpq.M;
long k = key;
int m0 = 0;
for (int cm = 0; cm < cpq.M; cm++) {
int ki = k & ((uint64_t(1) << cpq.nbits) - 1);
k >>= cpq.nbits;
const float* pc = ivfpq.precomputed_table.data() +
(ki * pq.M + cm * Mf) * pq.ksub;
for (int m = m0; m < m0 + Mf; m++) {
sim_table_ptrs[m] = pc;
pc += pq.ksub;
}
m0 += Mf;
}
} else {
FAISS_THROW_MSG("need precomputed tables");
}
if (polysemous_ht) {
FAISS_THROW_MSG("not implemented");
// Not clear that it makes sense to implement this,
// because it costs M * ksub, which is what we wanted to
// avoid with the table pointers.
}
return dis0;
}
};
template <class C>
struct KnnSearchResults {
idx_t key;
const idx_t* ids;
// heap params
size_t k;
float* heap_sim;
idx_t* heap_ids;
size_t nup;
inline void add(idx_t j, float dis) {
if (C::cmp(heap_sim[0], dis)) {
idx_t id = ids ? ids[j] : lo_build(key, j);
heap_replace_top<C>(k, heap_sim, heap_ids, dis, id);
nup++;
}
}
};
template <class C>
struct RangeSearchResults {
idx_t key;
const idx_t* ids;
// wrapped result structure
float radius;
RangeQueryResult& rres;
inline void add(idx_t j, float dis) {
if (C::cmp(radius, dis)) {
idx_t id = ids ? ids[j] : lo_build(key, j);
rres.add(dis, id);
}
}
};
/*****************************************************
* Scanning the codes.
* The scanning functions call their favorite precompute_*
* function to precompute the tables they need.
*****************************************************/
template <typename IDType, MetricType METRIC_TYPE, class PQDecoder>
struct IVFPQScannerT : QueryTables {
const uint8_t* list_codes;
const IDType* list_ids;
size_t list_size;
IVFPQScannerT(const IndexIVFPQ& ivfpq, const IVFSearchParameters* params)
: QueryTables(ivfpq, params) {
assert(METRIC_TYPE == metric_type);
}
float dis0;
void init_list(idx_t list_no, float coarse_dis, int mode) {
this->key = list_no;
this->coarse_dis = coarse_dis;
if (mode == 2) {
dis0 = precompute_list_tables();
} else if (mode == 1) {
dis0 = precompute_list_table_pointers();
}
}
/*****************************************************
* Scanning the codes: simple PQ scan.
*****************************************************/
/// version of the scan where we use precomputed tables
template <class SearchResultType>
void scan_list_with_table(
size_t ncode,
const uint8_t* codes,
SearchResultType& res) const {
for (size_t j = 0; j < ncode; j++) {
PQDecoder decoder(codes, pq.nbits);
codes += pq.code_size;
float dis = dis0;
const float* tab = sim_table;
for (size_t m = 0; m < pq.M; m++) {
dis += tab[decoder.decode()];
tab += pq.ksub;
}
res.add(j, dis);
}
}
/// tables are not precomputed, but pointers are provided to the
/// relevant X_c|x_r tables
template <class SearchResultType>
void scan_list_with_pointer(
size_t ncode,
const uint8_t* codes,
SearchResultType& res) const {
for (size_t j = 0; j < ncode; j++) {
PQDecoder decoder(codes, pq.nbits);
codes += pq.code_size;
float dis = dis0;
const float* tab = sim_table_2;
for (size_t m = 0; m < pq.M; m++) {
int ci = decoder.decode();
dis += sim_table_ptrs[m][ci] - 2 * tab[ci];
tab += pq.ksub;
}
res.add(j, dis);
}
}
/// nothing is precomputed: access residuals on-the-fly
template <class SearchResultType>
void scan_on_the_fly_dist(
size_t ncode,
const uint8_t* codes,
SearchResultType& res) const {
const float* dvec;
float dis0 = 0;
if (by_residual) {
if (METRIC_TYPE == METRIC_INNER_PRODUCT) {
ivfpq.quantizer->reconstruct(key, residual_vec);
dis0 = fvec_inner_product(residual_vec, qi, d);
} else {
ivfpq.quantizer->compute_residual(qi, residual_vec, key);
}
dvec = residual_vec;
} else {
dvec = qi;
dis0 = 0;
}
for (size_t j = 0; j < ncode; j++) {
pq.decode(codes, decoded_vec);
codes += pq.code_size;
float dis;
if (METRIC_TYPE == METRIC_INNER_PRODUCT) {
dis = dis0 + fvec_inner_product(decoded_vec, qi, d);
} else {
dis = fvec_L2sqr(decoded_vec, dvec, d);
}
res.add(j, dis);
}
}
/*****************************************************
* Scanning codes with polysemous filtering
*****************************************************/
template <class HammingComputer, class SearchResultType>
void scan_list_polysemous_hc(
size_t ncode,
const uint8_t* codes,
SearchResultType& res) const {
int ht = ivfpq.polysemous_ht;
size_t n_hamming_pass = 0, nup = 0;
int code_size = pq.code_size;
HammingComputer hc(q_code.data(), code_size);
for (size_t j = 0; j < ncode; j++) {
const uint8_t* b_code = codes;
int hd = hc.hamming(b_code);
if (hd < ht) {
n_hamming_pass++;
PQDecoder decoder(codes, pq.nbits);
float dis = dis0;
const float* tab = sim_table;
for (size_t m = 0; m < pq.M; m++) {
dis += tab[decoder.decode()];
tab += pq.ksub;
}
res.add(j, dis);
}
codes += code_size;
}
#pragma omp critical
{ indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; }
}
template <class SearchResultType>
void scan_list_polysemous(
size_t ncode,
const uint8_t* codes,
SearchResultType& res) const {
switch (pq.code_size) {
#define HANDLE_CODE_SIZE(cs) \
case cs: \
scan_list_polysemous_hc<HammingComputer##cs, SearchResultType>( \
ncode, codes, res); \
break
HANDLE_CODE_SIZE(4);
HANDLE_CODE_SIZE(8);
HANDLE_CODE_SIZE(16);
HANDLE_CODE_SIZE(20);
HANDLE_CODE_SIZE(32);
HANDLE_CODE_SIZE(64);
#undef HANDLE_CODE_SIZE
default:
scan_list_polysemous_hc<
HammingComputerDefault,
SearchResultType>(ncode, codes, res);
break;
}
}
};
/* We put as many parameters as possible in template. Hopefully the
* gain in runtime is worth the code bloat. C is the comparator < or
* >, it is directly related to METRIC_TYPE. precompute_mode is how
* much we precompute (2 = precompute distance tables, 1 = precompute
* pointers to distances, 0 = compute distances one by one).
* Currently only 2 is supported */
template <MetricType METRIC_TYPE, class C, class PQDecoder>
struct IVFPQScanner : IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
InvertedListScanner {
int precompute_mode;
IVFPQScanner(const IndexIVFPQ& ivfpq, bool store_pairs, int precompute_mode)
: IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>(
ivfpq,
nullptr),
precompute_mode(precompute_mode) {
this->store_pairs = store_pairs;
}
void set_query(const float* query) override {
this->init_query(query);
}
void set_list(idx_t list_no, float coarse_dis) override {
this->list_no = list_no;
this->init_list(list_no, coarse_dis, precompute_mode);
}
float distance_to_code(const uint8_t* code) const override {
assert(precompute_mode == 2);
float dis = this->dis0;
const float* tab = this->sim_table;
PQDecoder decoder(code, this->pq.nbits);
for (size_t m = 0; m < this->pq.M; m++) {
dis += tab[decoder.decode()];
tab += this->pq.ksub;
}
return dis;
}
size_t scan_codes(
size_t ncode,
const uint8_t* codes,
const idx_t* ids,
float* heap_sim,
idx_t* heap_ids,
size_t k) const override {
KnnSearchResults<C> res = {
/* key */ this->key,
/* ids */ this->store_pairs ? nullptr : ids,
/* k */ k,
/* heap_sim */ heap_sim,
/* heap_ids */ heap_ids,
/* nup */ 0};
if (this->polysemous_ht > 0) {
assert(precompute_mode == 2);
this->scan_list_polysemous(ncode, codes, res);
} else if (precompute_mode == 2) {
this->scan_list_with_table(ncode, codes, res);
} else if (precompute_mode == 1) {
this->scan_list_with_pointer(ncode, codes, res);
} else if (precompute_mode == 0) {
this->scan_on_the_fly_dist(ncode, codes, res);
} else {
FAISS_THROW_MSG("bad precomp mode");
}
return res.nup;
}
void scan_codes_range(
size_t ncode,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& rres) const override {
RangeSearchResults<C> res = {
/* key */ this->key,
/* ids */ this->store_pairs ? nullptr : ids,
/* radius */ radius,
/* rres */ rres};
if (this->polysemous_ht > 0) {
assert(precompute_mode == 2);
this->scan_list_polysemous(ncode, codes, res);
} else if (precompute_mode == 2) {
this->scan_list_with_table(ncode, codes, res);
} else if (precompute_mode == 1) {
this->scan_list_with_pointer(ncode, codes, res);
} else if (precompute_mode == 0) {
this->scan_on_the_fly_dist(ncode, codes, res);
} else {
FAISS_THROW_MSG("bad precomp mode");
}
}
};
template <class PQDecoder>
InvertedListScanner* get_InvertedListScanner1(
const IndexIVFPQ& index,
bool store_pairs) {
if (index.metric_type == METRIC_INNER_PRODUCT) {
return new IVFPQScanner<
METRIC_INNER_PRODUCT,
CMin<float, idx_t>,
PQDecoder>(index, store_pairs, 2);
} else if (index.metric_type == METRIC_L2) {
return new IVFPQScanner<METRIC_L2, CMax<float, idx_t>, PQDecoder>(
index, store_pairs, 2);
}
return nullptr;
}
} // anonymous namespace
InvertedListScanner* IndexIVFPQ::get_InvertedListScanner(
bool store_pairs) const {
if (pq.nbits == 8) {
return get_InvertedListScanner1<PQDecoder8>(*this, store_pairs);
} else if (pq.nbits == 16) {
return get_InvertedListScanner1<PQDecoder16>(*this, store_pairs);
} else {
return get_InvertedListScanner1<PQDecoderGeneric>(*this, store_pairs);
}
return nullptr;
}
IndexIVFPQStats indexIVFPQ_stats;
void IndexIVFPQStats::reset() {
memset(this, 0, sizeof(*this));
}
IndexIVFPQ::IndexIVFPQ() {
// initialize some runtime values
use_precomputed_table = 0;
scan_table_threshold = 0;
do_polysemous_training = false;
polysemous_ht = 0;
polysemous_training = nullptr;
}
struct CodeCmp {
const uint8_t* tab;
size_t code_size;
bool operator()(int a, int b) const {
return cmp(a, b) > 0;
}
int cmp(int a, int b) const {
return memcmp(tab + a * code_size, tab + b * code_size, code_size);
}
};
size_t IndexIVFPQ::find_duplicates(idx_t* dup_ids, size_t* lims) const {
size_t ngroup = 0;
lims[0] = 0;
for (size_t list_no = 0; list_no < nlist; list_no++) {
size_t n = invlists->list_size(list_no);
std::vector<int> ord(n);
for (int i = 0; i < n; i++)
ord[i] = i;
InvertedLists::ScopedCodes codes(invlists, list_no);
CodeCmp cs = {codes.get(), code_size};
std::sort(ord.begin(), ord.end(), cs);
InvertedLists::ScopedIds list_ids(invlists, list_no);
int prev = -1; // all elements from prev to i-1 are equal
for (int i = 0; i < n; i++) {
if (prev >= 0 && cs.cmp(ord[prev], ord[i]) == 0) {
// same as previous => remember
if (prev + 1 == i) { // start new group
ngroup++;
lims[ngroup] = lims[ngroup - 1];
dup_ids[lims[ngroup]++] = list_ids[ord[prev]];
}
dup_ids[lims[ngroup]++] = list_ids[ord[i]];
} else { // not same as previous.
prev = i;
}
}
}
return ngroup;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFPQ_H
#define FAISS_INDEX_IVFPQ_H
#include <vector>
#include <faiss/IndexIVF.h>
#include <faiss/IndexPQ.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
struct IVFPQSearchParameters : IVFSearchParameters {
size_t scan_table_threshold; ///< use table computation or on-the-fly?
int polysemous_ht; ///< Hamming thresh for polysemous filtering
IVFPQSearchParameters() : scan_table_threshold(0), polysemous_ht(0) {}
~IVFPQSearchParameters() {}
};
FAISS_API extern size_t precomputed_table_max_bytes;
/** Inverted file with Product Quantizer encoding. Each residual
* vector is encoded as a product quantizer code.
*/
struct IndexIVFPQ : IndexIVF {
bool by_residual; ///< Encode residual or plain vector?
ProductQuantizer pq; ///< produces the codes
bool do_polysemous_training; ///< reorder PQ centroids after training?
PolysemousTraining* polysemous_training; ///< if NULL, use default
// search-time parameters
size_t scan_table_threshold; ///< use table computation or on-the-fly?
int polysemous_ht; ///< Hamming thresh for polysemous filtering
/** Precompute table that speeds up query preprocessing at some
* memory cost (used only for by_residual with L2 metric)
*/
int use_precomputed_table;
/// if use_precomputed_table
/// size nlist * pq.M * pq.ksub
AlignedTable<float> precomputed_table;
IndexIVFPQ(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
MetricType metric = METRIC_L2);
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override;
void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx) override;
/// same as add_core, also:
/// - output 2nd level residuals if residuals_2 != NULL
/// - accepts precomputed_idx = nullptr
void add_core_o(
idx_t n,
const float* x,
const idx_t* xids,
float* residuals_2,
const idx_t* precomputed_idx = nullptr);
/// trains the product quantizer
void train_residual(idx_t n, const float* x) override;
/// same as train_residual, also output 2nd level residuals
void train_residual_o(idx_t n, const float* x, float* residuals_2);
void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
const override;
/** Find exact duplicates in the dataset.
*
* the duplicates are returned in pre-allocated arrays (see the
* max sizes).
*
* @param lims limits between groups of duplicates
* (max size ntotal / 2 + 1)
* @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of
* duplicates (max size ntotal)
* @return n number of groups found
*/
size_t find_duplicates(idx_t* ids, size_t* lims) const;
// map a vector to a binary code knowing the index
void encode(idx_t key, const float* x, uint8_t* code) const;
/** Encode multiple vectors
*
* @param n nb vectors to encode
* @param keys posting list ids for those vectors (size n)
* @param x vectors (size n * d)
* @param codes output codes (size n * code_size)
* @param compute_keys if false, assume keys are precomputed,
* otherwise compute them
*/
void encode_multiple(
size_t n,
idx_t* keys,
const float* x,
uint8_t* codes,
bool compute_keys = false) const;
/// inverse of encode_multiple
void decode_multiple(
size_t n,
const idx_t* keys,
const uint8_t* xcodes,
float* x) const;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
/// build precomputed table
void precompute_table();
IndexIVFPQ();
};
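// A hedged usage sketch for find_duplicates (illustrative only; variable
// names are assumptions): the caller pre-allocates both output arrays to
// their documented maximum sizes, then walks the groups delimited by lims.
//
//   std::vector<faiss::Index::idx_t> dup_ids(index.ntotal);
//   std::vector<size_t> lims(index.ntotal / 2 + 1);
//   size_t ngroup = index.find_duplicates(dup_ids.data(), lims.data());
//   for (size_t g = 0; g < ngroup; g++) {
//       // dup_ids[lims[g]] .. dup_ids[lims[g + 1] - 1] share the same code
//   }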
/** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2
*
* @param use_precomputed_table (I/O)
*        =-1: force disable
*        =0: decide heuristically (default: use tables only if they are
*            smaller than precomputed_table_max_bytes), and set
*            use_precomputed_table on output
*        =1: tables that work for all quantizers (size 256 * nlist * M)
*        =2: specific version for MultiIndexQuantizer (much more compact)
* @param precomputed_table precomputed table to initialize
*/
void initialize_IVFPQ_precomputed_table(
int& use_precomputed_table,
const Index* quantizer,
const ProductQuantizer& pq,
AlignedTable<float>& precomputed_table,
bool verbose);
/// statistics are robust to internal threading, but not if
/// IndexIVFPQ::search_preassigned is called by multiple threads
struct IndexIVFPQStats {
size_t nrefine; ///< nb of refines (IVFPQR)
size_t n_hamming_pass;
///< nb of passed Hamming distance tests (for polysemous)
// timings measured with the CPU RTC on all threads
size_t search_cycles;
size_t refine_cycles; ///< only for IVFPQR
IndexIVFPQStats() {
reset();
}
void reset();
};
// global var that collects them all
FAISS_API extern IndexIVFPQStats indexIVFPQ_stats;
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexIVFPQFastScan.h>
#include <cassert>
#include <cinttypes>
#include <cstdio>
#include <omp.h>
#include <memory>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/simdlib.h>
#include <faiss/utils/utils.h>
#include <faiss/invlists/BlockInvertedLists.h>
#include <faiss/impl/pq4_fast_scan.h>
#include <faiss/impl/simd_result_handlers.h>
#include <faiss/utils/quantize_lut.h>
namespace faiss {
using namespace simd_result_handlers;
inline size_t roundup(size_t a, size_t b) {
return (a + b - 1) / b * b;
}
IndexIVFPQFastScan::IndexIVFPQFastScan(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
MetricType metric,
int bbs)
: IndexIVF(quantizer, d, nlist, 0, metric),
pq(d, M, nbits_per_idx),
bbs(bbs) {
FAISS_THROW_IF_NOT(nbits_per_idx == 4);
M2 = roundup(pq.M, 2);
by_residual = false; // set to false by default because it's much faster
is_trained = false;
code_size = pq.code_size;
replace_invlists(new BlockInvertedLists(nlist, bbs, bbs * M2 / 2), true);
}
IndexIVFPQFastScan::IndexIVFPQFastScan() {
by_residual = false;
bbs = 0;
M2 = 0;
}
IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs)
: IndexIVF(
orig.quantizer,
orig.d,
orig.nlist,
orig.pq.code_size,
orig.metric_type),
pq(orig.pq),
bbs(bbs) {
FAISS_THROW_IF_NOT(orig.pq.nbits == 4);
by_residual = orig.by_residual;
ntotal = orig.ntotal;
is_trained = orig.is_trained;
nprobe = orig.nprobe;
size_t M = pq.M;
M2 = roundup(M, 2);
replace_invlists(
new BlockInvertedLists(orig.nlist, bbs, bbs * M2 / 2), true);
precomputed_table.resize(orig.precomputed_table.size());
if (precomputed_table.nbytes() > 0) {
memcpy(precomputed_table.get(),
orig.precomputed_table.data(),
precomputed_table.nbytes());
}
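// repack the flat PQ codes of each original inverted list into the
// bbs-sized blocks expected by the fast-scan kernels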
for (size_t i = 0; i < nlist; i++) {
size_t nb = orig.invlists->list_size(i);
size_t nb2 = roundup(nb, bbs);
AlignedTable<uint8_t> tmp(nb2 * M2 / 2);
pq4_pack_codes(
InvertedLists::ScopedCodes(orig.invlists, i).get(),
nb,
M,
nb2,
bbs,
M2,
tmp.get());
invlists->add_entries(
i,
nb,
InvertedLists::ScopedIds(orig.invlists, i).get(),
tmp.get());
}
orig_invlists = orig.invlists;
}
/*********************************************************
* Training
*********************************************************/
void IndexIVFPQFastScan::train_residual(idx_t n, const float* x_in) {
const float* x = fvecs_maybe_subsample(
d,
(size_t*)&n,
pq.cp.max_points_per_centroid * pq.ksub,
x_in,
verbose,
pq.cp.seed);
std::unique_ptr<float[]> del_x;
if (x != x_in) {
del_x.reset((float*)x);
}
const float* trainset;
AlignedTable<float> residuals;
if (by_residual) {
if (verbose)
printf("computing residuals\n");
std::vector<idx_t> assign(n);
quantizer->assign(n, x, assign.data());
residuals.resize(n * d);
for (idx_t i = 0; i < n; i++) {
quantizer->compute_residual(
x + i * d, residuals.data() + i * d, assign[i]);
}
trainset = residuals.data();
} else {
trainset = x;
}
if (verbose) {
printf("training %zdx%zd product quantizer on "
"%" PRId64 " vectors in %dD\n",
pq.M,
pq.ksub,
n,
d);
}
pq.verbose = verbose;
pq.train(n, trainset);
if (by_residual && metric_type == METRIC_L2) {
precompute_table();
}
}
void IndexIVFPQFastScan::precompute_table() {
initialize_IVFPQ_precomputed_table(
use_precomputed_table, quantizer, pq, precomputed_table, verbose);
}
/*********************************************************
* Code management functions
*********************************************************/
void IndexIVFPQFastScan::encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos) const {
if (by_residual) {
AlignedTable<float> residuals(n * d);
for (size_t i = 0; i < n; i++) {
if (list_nos[i] < 0) {
memset(residuals.data() + i * d, 0, sizeof(residuals[0]) * d);
} else {
quantizer->compute_residual(
x + i * d, residuals.data() + i * d, list_nos[i]);
}
}
pq.compute_codes(residuals.data(), codes, n);
} else {
pq.compute_codes(x, codes, n);
}
if (include_listnos) {
size_t coarse_size = coarse_code_size();
for (idx_t i = n - 1; i >= 0; i--) {
uint8_t* code = codes + i * (coarse_size + code_size);
memmove(code + coarse_size, codes + i * code_size, code_size);
encode_listno(list_nos[i], code);
}
}
}
void IndexIVFPQFastScan::add_with_ids(
idx_t n,
const float* x,
const idx_t* xids) {
// copied from IndexIVF::add_with_ids --->
// do some blocking to avoid excessive allocs
idx_t bs = 65536;
if (n > bs) {
for (idx_t i0 = 0; i0 < n; i0 += bs) {
idx_t i1 = std::min(n, i0 + bs);
if (verbose) {
printf(" IndexIVFPQFastScan::add_with_ids %zd: %zd",
size_t(i0),
size_t(i1));
}
add_with_ids(i1 - i0, x + i0 * d, xids ? xids + i0 : nullptr);
}
return;
}
InterruptCallback::check();
AlignedTable<uint8_t> codes(n * code_size);
FAISS_THROW_IF_NOT(is_trained);
direct_map.check_can_add(xids);
std::unique_ptr<idx_t[]> idx(new idx_t[n]);
quantizer->assign(n, x, idx.get());
size_t nadd = 0, nminus1 = 0;
for (size_t i = 0; i < n; i++) {
if (idx[i] < 0)
nminus1++;
}
AlignedTable<uint8_t> flat_codes(n * code_size);
encode_vectors(n, x, idx.get(), flat_codes.get());
DirectMapAdd dm_adder(direct_map, n, xids);
// <---
BlockInvertedLists* bil = dynamic_cast<BlockInvertedLists*>(invlists);
FAISS_THROW_IF_NOT_MSG(bil, "only block inverted lists supported");
// prepare batches
std::vector<idx_t> order(n);
for (idx_t i = 0; i < n; i++) {
order[i] = i;
}
// TODO should not need stable
std::stable_sort(order.begin(), order.end(), [&idx](idx_t a, idx_t b) {
return idx[a] < idx[b];
});
// TODO parallelize
idx_t i0 = 0;
while (i0 < n) {
idx_t list_no = idx[order[i0]];
idx_t i1 = i0 + 1;
while (i1 < n && idx[order[i1]] == list_no) {
i1++;
}
if (list_no == -1) {
i0 = i1;
continue;
}
// make linear array
AlignedTable<uint8_t> list_codes((i1 - i0) * code_size);
size_t list_size = bil->list_size(list_no);
bil->resize(list_no, list_size + i1 - i0);
for (idx_t i = i0; i < i1; i++) {
size_t ofs = list_size + i - i0;
idx_t id = xids ? xids[order[i]] : ntotal + order[i];
dm_adder.add(order[i], list_no, ofs);
bil->ids[list_no][ofs] = id;
memcpy(list_codes.data() + (i - i0) * code_size,
flat_codes.data() + order[i] * code_size,
code_size);
nadd++;
}
pq4_pack_codes_range(
list_codes.data(),
pq.M,
list_size,
list_size + i1 - i0,
bbs,
M2,
bil->codes[list_no].data());
i0 = i1;
}
ntotal += n;
}
/*********************************************************
* search
*********************************************************/
namespace {
// from impl/ProductQuantizer.cpp
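// for each code: accumulate the LUT entries of its M sub-indices (plus dis0)
// and update the per-query heap when the candidate improves on the current
// worst element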
template <class C, typename dis_t>
void pq_estimators_from_tables_generic(
const ProductQuantizer& pq,
size_t nbits,
const uint8_t* codes,
size_t ncodes,
const dis_t* dis_table,
const int64_t* ids,
float dis0,
size_t k,
typename C::T* heap_dis,
int64_t* heap_ids) {
using accu_t = typename C::T;
const size_t M = pq.M;
const size_t ksub = pq.ksub;
for (size_t j = 0; j < ncodes; ++j) {
PQDecoderGeneric decoder(codes + j * pq.code_size, nbits);
accu_t dis = dis0;
const dis_t* dt = dis_table;
for (size_t m = 0; m < M; m++) {
uint64_t c = decoder.decode();
dis += dt[c];
dt += ksub;
}
if (C::cmp(heap_dis[0], dis)) {
heap_pop<C>(k, heap_dis, heap_ids);
heap_push<C>(k, heap_dis, heap_ids, dis, ids[j]);
}
}
}
using idx_t = Index::idx_t;
using namespace quantize_lut;
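// computes c := a + bf * b over aligned buffers, 8 floats at a time with
// simd8float32 (n must be a multiple of 8)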
void fvec_madd_avx(
size_t n,
const float* a,
float bf,
const float* b,
float* c) {
assert(is_aligned_pointer(a));
assert(is_aligned_pointer(b));
assert(is_aligned_pointer(c));
assert(n % 8 == 0);
simd8float32 bf8(bf);
n /= 8;
for (size_t i = 0; i < n; i++) {
simd8float32 ai(a);
simd8float32 bi(b);
simd8float32 ci = fmadd(bf8, bi, ai);
ci.store(c);
c += 8;
a += 8;
b += 8;
}
}
} // anonymous namespace
/*********************************************************
* Look-Up Table functions
*********************************************************/
void IndexIVFPQFastScan::compute_LUT(
size_t n,
const float* x,
const idx_t* coarse_ids,
const float* coarse_dis,
AlignedTable<float>& dis_tables,
AlignedTable<float>& biases) const {
const IndexIVFPQFastScan& ivfpq = *this;
size_t dim12 = pq.ksub * pq.M;
size_t d = pq.d;
size_t nprobe = ivfpq.nprobe;
if (ivfpq.by_residual) {
if (ivfpq.metric_type == METRIC_L2) {
dis_tables.resize(n * nprobe * dim12);
if (ivfpq.use_precomputed_table == 1) {
biases.resize(n * nprobe);
memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
AlignedTable<float> ip_table(n * dim12);
pq.compute_inner_prod_tables(n, x, ip_table.get());
#pragma omp parallel for if (n * nprobe > 8000)
for (idx_t ij = 0; ij < n * nprobe; ij++) {
idx_t i = ij / nprobe;
float* tab = dis_tables.get() + ij * dim12;
idx_t cij = coarse_ids[ij];
if (cij >= 0) {
fvec_madd_avx(
dim12,
precomputed_table.get() + cij * dim12,
-2,
ip_table.get() + i * dim12,
tab);
} else {
// fill with NaNs so that they are ignored during
// LUT quantization
memset(tab, -1, sizeof(float) * dim12);
}
}
} else {
std::unique_ptr<float[]> xrel(new float[n * nprobe * d]);
biases.resize(n * nprobe);
memset(biases.get(), 0, sizeof(float) * n * nprobe);
#pragma omp parallel for if (n * nprobe > 8000)
for (idx_t ij = 0; ij < n * nprobe; ij++) {
idx_t i = ij / nprobe;
float* xij = &xrel[ij * d];
idx_t cij = coarse_ids[ij];
if (cij >= 0) {
ivfpq.quantizer->compute_residual(x + i * d, xij, cij);
} else {
// will fill with NaNs
memset(xij, -1, sizeof(float) * d);
}
}
pq.compute_distance_tables(
n * nprobe, xrel.get(), dis_tables.get());
}
} else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
dis_tables.resize(n * dim12);
pq.compute_inner_prod_tables(n, x, dis_tables.get());
// compute_inner_prod_tables(pq, n, x, dis_tables.get());
biases.resize(n * nprobe);
memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe);
} else {
FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
}
} else {
dis_tables.resize(n * dim12);
if (ivfpq.metric_type == METRIC_L2) {
pq.compute_distance_tables(n, x, dis_tables.get());
} else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) {
pq.compute_inner_prod_tables(n, x, dis_tables.get());
} else {
FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type);
}
}
}
void IndexIVFPQFastScan::compute_LUT_uint8(
size_t n,
const float* x,
const idx_t* coarse_ids,
const float* coarse_dis,
AlignedTable<uint8_t>& dis_tables,
AlignedTable<uint16_t>& biases,
float* normalizers) const {
const IndexIVFPQFastScan& ivfpq = *this;
AlignedTable<float> dis_tables_float;
AlignedTable<float> biases_float;
uint64_t t0 = get_cy();
compute_LUT(n, x, coarse_ids, coarse_dis, dis_tables_float, biases_float);
IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0;
bool lut_is_3d = ivfpq.by_residual && ivfpq.metric_type == METRIC_L2;
size_t dim123 = pq.ksub * pq.M;
size_t dim123_2 = pq.ksub * M2;
if (lut_is_3d) {
dim123 *= nprobe;
dim123_2 *= nprobe;
}
dis_tables.resize(n * dim123_2);
if (biases_float.get()) {
biases.resize(n * nprobe);
}
uint64_t t1 = get_cy();
#pragma omp parallel for if (n > 100)
for (int64_t i = 0; i < n; i++) {
const float* t_in = dis_tables_float.get() + i * dim123;
const float* b_in = nullptr;
uint8_t* t_out = dis_tables.get() + i * dim123_2;
uint16_t* b_out = nullptr;
if (biases_float.get()) {
b_in = biases_float.get() + i * nprobe;
b_out = biases.get() + i * nprobe;
}
quantize_LUT_and_bias(
nprobe,
pq.M,
pq.ksub,
lut_is_3d,
t_in,
b_in,
t_out,
M2,
b_out,
normalizers + 2 * i,
normalizers + 2 * i + 1);
}
IVFFastScan_stats.t_round += get_cy() - t1;
}
/*********************************************************
* Search functions
*********************************************************/
template <bool is_max>
void IndexIVFPQFastScan::search_dispatch_implem(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
using Cfloat = typename std::conditional<
is_max,
CMax<float, int64_t>,
CMin<float, int64_t>>::type;
using C = typename std::conditional<
is_max,
CMax<uint16_t, int64_t>,
CMin<uint16_t, int64_t>>::type;
if (n == 0) {
return;
}
// actual implementation used
int impl = implem;
if (impl == 0) {
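// auto-select: the qbs-based kernels (12/13) expect codes packed with the
// default block size bbs == 32, otherwise fall back to 10/11; a large k
// switches from heap to reservoir result collection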
if (bbs == 32) {
impl = 12;
} else {
impl = 10;
}
if (k > 20) {
impl++;
}
}
if (impl == 1) {
search_implem_1<Cfloat>(n, x, k, distances, labels);
} else if (impl == 2) {
search_implem_2<C>(n, x, k, distances, labels);
} else if (impl >= 10 && impl <= 13) {
size_t ndis = 0, nlist_visited = 0;
if (n < 2) {
if (impl == 12 || impl == 13) {
search_implem_12<C>(
n,
x,
k,
distances,
labels,
impl,
&ndis,
&nlist_visited);
} else {
search_implem_10<C>(
n,
x,
k,
distances,
labels,
impl,
&ndis,
&nlist_visited);
}
} else {
// explicitly slice over threads
int nslice;
if (n <= omp_get_max_threads()) {
nslice = n;
} else if (by_residual && metric_type == METRIC_L2) {
// make sure the LUT tables don't get too big
size_t lut_size_per_query = pq.M * pq.ksub * nprobe *
(sizeof(float) + sizeof(uint8_t));
size_t max_lut_size = precomputed_table_max_bytes;
// how many queries we can handle within mem budget
size_t nq_ok =
std::max(max_lut_size / lut_size_per_query, size_t(1));
nslice =
roundup(std::max(size_t(n / nq_ok), size_t(1)),
omp_get_max_threads());
} else {
// LUTs unlikely to be a limiting factor
nslice = omp_get_max_threads();
}
#pragma omp parallel for reduction(+ : ndis, nlist_visited)
for (int slice = 0; slice < nslice; slice++) {
idx_t i0 = n * slice / nslice;
idx_t i1 = n * (slice + 1) / nslice;
float* dis_i = distances + i0 * k;
idx_t* lab_i = labels + i0 * k;
if (impl == 12 || impl == 13) {
search_implem_12<C>(
i1 - i0,
x + i0 * d,
k,
dis_i,
lab_i,
impl,
&ndis,
&nlist_visited);
} else {
search_implem_10<C>(
i1 - i0,
x + i0 * d,
k,
dis_i,
lab_i,
impl,
&ndis,
&nlist_visited);
}
}
}
indexIVF_stats.nq += n;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nlist += nlist_visited;
} else {
FAISS_THROW_FMT("implem %d does not exist", implem);
}
}
void IndexIVFPQFastScan::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
if (metric_type == METRIC_L2) {
search_dispatch_implem<true>(n, x, k, distances, labels);
} else {
search_dispatch_implem<false>(n, x, k, distances, labels);
}
}
template <class C>
void IndexIVFPQFastScan::search_implem_1(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(orig_invlists);
std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get());
size_t dim12 = pq.ksub * pq.M;
AlignedTable<float> dis_tables;
AlignedTable<float> biases;
compute_LUT(n, x, coarse_ids.get(), coarse_dis.get(), dis_tables, biases);
bool single_LUT = !(by_residual && metric_type == METRIC_L2);
size_t ndis = 0, nlist_visited = 0;
#pragma omp parallel for reduction(+ : ndis, nlist_visited)
for (idx_t i = 0; i < n; i++) {
int64_t* heap_ids = labels + i * k;
float* heap_dis = distances + i * k;
heap_heapify<C>(k, heap_dis, heap_ids);
float* LUT = nullptr;
if (single_LUT) {
LUT = dis_tables.get() + i * dim12;
}
for (idx_t j = 0; j < nprobe; j++) {
if (!single_LUT) {
LUT = dis_tables.get() + (i * nprobe + j) * dim12;
}
idx_t list_no = coarse_ids[i * nprobe + j];
if (list_no < 0)
continue;
size_t ls = orig_invlists->list_size(list_no);
if (ls == 0)
continue;
InvertedLists::ScopedCodes codes(orig_invlists, list_no);
InvertedLists::ScopedIds ids(orig_invlists, list_no);
float bias = biases.get() ? biases[i * nprobe + j] : 0;
pq_estimators_from_tables_generic<C>(
pq,
pq.nbits,
codes.get(),
ls,
LUT,
ids.get(),
bias,
k,
heap_dis,
heap_ids);
nlist_visited++;
ndis++;
}
heap_reorder<C>(k, heap_dis, heap_ids);
}
indexIVF_stats.nq += n;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nlist += nlist_visited;
}
template <class C>
void IndexIVFPQFastScan::search_implem_2(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(orig_invlists);
std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get());
size_t dim12 = pq.ksub * M2;
AlignedTable<uint8_t> dis_tables;
AlignedTable<uint16_t> biases;
std::unique_ptr<float[]> normalizers(new float[2 * n]);
compute_LUT_uint8(
n,
x,
coarse_ids.get(),
coarse_dis.get(),
dis_tables,
biases,
normalizers.get());
bool single_LUT = !(by_residual && metric_type == METRIC_L2);
size_t ndis = 0, nlist_visited = 0;
#pragma omp parallel for reduction(+ : ndis, nlist_visited)
for (idx_t i = 0; i < n; i++) {
std::vector<uint16_t> tmp_dis(k);
int64_t* heap_ids = labels + i * k;
uint16_t* heap_dis = tmp_dis.data();
heap_heapify<C>(k, heap_dis, heap_ids);
const uint8_t* LUT = nullptr;
if (single_LUT) {
LUT = dis_tables.get() + i * dim12;
}
for (idx_t j = 0; j < nprobe; j++) {
if (!single_LUT) {
LUT = dis_tables.get() + (i * nprobe + j) * dim12;
}
idx_t list_no = coarse_ids[i * nprobe + j];
if (list_no < 0)
continue;
size_t ls = orig_invlists->list_size(list_no);
if (ls == 0)
continue;
InvertedLists::ScopedCodes codes(orig_invlists, list_no);
InvertedLists::ScopedIds ids(orig_invlists, list_no);
uint16_t bias = biases.get() ? biases[i * nprobe + j] : 0;
pq_estimators_from_tables_generic<C>(
pq,
pq.nbits,
codes.get(),
ls,
LUT,
ids.get(),
bias,
k,
heap_dis,
heap_ids);
nlist_visited++;
ndis += ls;
}
heap_reorder<C>(k, heap_dis, heap_ids);
// convert distances to float
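// normalizers[2*i] is the quantization scale a and normalizers[2*i+1]
// the bias b written by compute_LUT_uint8; the float distance is
// recovered as b + uint16_dis / a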
{
float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1];
if (skip & 16) {
one_a = 1;
b = 0;
}
float* heap_dis_float = distances + i * k;
for (int j = 0; j < k; j++) {
heap_dis_float[j] = b + heap_dis[j] * one_a;
}
}
}
indexIVF_stats.nq += n;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nlist += nlist_visited;
}
template <class C>
void IndexIVFPQFastScan::search_implem_10(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
int impl,
size_t* ndis_out,
size_t* nlist_out) const {
memset(distances, -1, sizeof(float) * k * n);
memset(labels, -1, sizeof(idx_t) * k * n);
using HeapHC = HeapHandler<C, true>;
using ReservoirHC = ReservoirHandler<C, true>;
using SingleResultHC = SingleResultHandler<C, true>;
std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
uint64_t times[10];
memset(times, 0, sizeof(times));
int ti = 0;
#define TIC times[ti++] = get_cy()
TIC;
quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get());
TIC;
size_t dim12 = pq.ksub * M2;
AlignedTable<uint8_t> dis_tables;
AlignedTable<uint16_t> biases;
std::unique_ptr<float[]> normalizers(new float[2 * n]);
compute_LUT_uint8(
n,
x,
coarse_ids.get(),
coarse_dis.get(),
dis_tables,
biases,
normalizers.get());
TIC;
bool single_LUT = !(by_residual && metric_type == METRIC_L2);
TIC;
size_t ndis = 0, nlist_visited = 0;
{
AlignedTable<uint16_t> tmp_distances(k);
for (idx_t i = 0; i < n; i++) {
const uint8_t* LUT = nullptr;
int qmap1[1] = {0};
std::unique_ptr<SIMDResultHandler<C, true>> handler;
if (k == 1) {
handler.reset(new SingleResultHC(1, 0));
} else if (impl == 10) {
handler.reset(new HeapHC(
1, tmp_distances.get(), labels + i * k, k, 0));
} else if (impl == 11) {
handler.reset(new ReservoirHC(1, 0, k, 2 * k));
} else {
FAISS_THROW_MSG("invalid");
}
handler->q_map = qmap1;
if (single_LUT) {
LUT = dis_tables.get() + i * dim12;
}
for (idx_t j = 0; j < nprobe; j++) {
size_t ij = i * nprobe + j;
if (!single_LUT) {
LUT = dis_tables.get() + ij * dim12;
}
if (biases.get()) {
handler->dbias = biases.get() + ij;
}
idx_t list_no = coarse_ids[ij];
if (list_no < 0)
continue;
size_t ls = invlists->list_size(list_no);
if (ls == 0)
continue;
InvertedLists::ScopedCodes codes(invlists, list_no);
InvertedLists::ScopedIds ids(invlists, list_no);
handler->ntotal = ls;
handler->id_map = ids.get();
#define DISPATCH(classHC) \
if (dynamic_cast<classHC*>(handler.get())) { \
auto* res = static_cast<classHC*>(handler.get()); \
pq4_accumulate_loop( \
1, roundup(ls, bbs), bbs, M2, codes.get(), LUT, *res); \
}
DISPATCH(HeapHC)
else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC)
#undef DISPATCH
nlist_visited++;
ndis++;
}
handler->to_flat_arrays(
distances + i * k,
labels + i * k,
skip & 16 ? nullptr : normalizers.get() + i * 2);
}
}
*ndis_out = ndis;
*nlist_out = nlist;
}
template <class C>
void IndexIVFPQFastScan::search_implem_12(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
int impl,
size_t* ndis_out,
size_t* nlist_out) const {
if (n == 0) { // does not work well with reservoir
return;
}
FAISS_THROW_IF_NOT(bbs == 32);
std::unique_ptr<idx_t[]> coarse_ids(new idx_t[n * nprobe]);
std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
uint64_t times[10];
memset(times, 0, sizeof(times));
int ti = 0;
#define TIC times[ti++] = get_cy()
TIC;
quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get());
TIC;
size_t dim12 = pq.ksub * M2;
AlignedTable<uint8_t> dis_tables;
AlignedTable<uint16_t> biases;
std::unique_ptr<float[]> normalizers(new float[2 * n]);
compute_LUT_uint8(
n,
x,
coarse_ids.get(),
coarse_dis.get(),
dis_tables,
biases,
normalizers.get());
TIC;
struct QC {
int qno; // sequence number of the query
int list_no; // list to visit
int rank; // this is the rank'th result of the coarse quantizer
};
bool single_LUT = !(by_residual && metric_type == METRIC_L2);
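// collect the (query, probe) pairs and sort them by inverted list, so that
// each list is scanned once for all the queries that probe it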
std::vector<QC> qcs;
{
int ij = 0;
for (int i = 0; i < n; i++) {
for (int j = 0; j < nprobe; j++) {
if (coarse_ids[ij] >= 0) {
qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)});
}
ij++;
}
}
std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) {
return a.list_no < b.list_no;
});
}
TIC;
// prepare the result handlers
std::unique_ptr<SIMDResultHandler<C, true>> handler;
AlignedTable<uint16_t> tmp_distances;
using HeapHC = HeapHandler<C, true>;
using ReservoirHC = ReservoirHandler<C, true>;
using SingleResultHC = SingleResultHandler<C, true>;
if (k == 1) {
handler.reset(new SingleResultHC(n, 0));
} else if (impl == 12) {
tmp_distances.resize(n * k);
handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0));
} else if (impl == 13) {
handler.reset(new ReservoirHC(n, 0, k, 2 * k));
}
int qbs2 = this->qbs2 ? this->qbs2 : 11;
std::vector<uint16_t> tmp_bias;
if (biases.get()) {
tmp_bias.resize(qbs2);
handler->dbias = tmp_bias.data();
}
TIC;
size_t ndis = 0;
size_t i0 = 0;
uint64_t t_copy_pack = 0, t_scan = 0;
while (i0 < qcs.size()) {
uint64_t tt0 = get_cy();
// find all queries that access this inverted list
int list_no = qcs[i0].list_no;
size_t i1 = i0 + 1;
while (i1 < qcs.size() && i1 < i0 + qbs2) {
if (qcs[i1].list_no != list_no) {
break;
}
i1++;
}
size_t list_size = invlists->list_size(list_no);
if (list_size == 0) {
i0 = i1;
continue;
}
// re-organize LUTs and biases into the right order
int nc = i1 - i0;
std::vector<int> q_map(nc), lut_entries(nc);
AlignedTable<uint8_t> LUT(nc * dim12);
memset(LUT.get(), -1, nc * dim12);
int qbs = pq4_preferred_qbs(nc);
for (size_t i = i0; i < i1; i++) {
const QC& qc = qcs[i];
q_map[i - i0] = qc.qno;
int ij = qc.qno * nprobe + qc.rank;
lut_entries[i - i0] = single_LUT ? qc.qno : ij;
if (biases.get()) {
tmp_bias[i - i0] = biases[ij];
}
}
pq4_pack_LUT_qbs_q_map(
qbs, M2, dis_tables.get(), lut_entries.data(), LUT.get());
// access the inverted list
ndis += (i1 - i0) * list_size;
InvertedLists::ScopedCodes codes(invlists, list_no);
InvertedLists::ScopedIds ids(invlists, list_no);
// prepare the handler
handler->ntotal = list_size;
handler->q_map = q_map.data();
handler->id_map = ids.get();
uint64_t tt1 = get_cy();
#define DISPATCH(classHC) \
if (dynamic_cast<classHC*>(handler.get())) { \
auto* res = static_cast<classHC*>(handler.get()); \
pq4_accumulate_loop_qbs( \
qbs, list_size, M2, codes.get(), LUT.get(), *res); \
}
DISPATCH(HeapHC)
else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC)
// prepare for next loop
i0 = i1;
uint64_t tt2 = get_cy();
t_copy_pack += tt1 - tt0;
t_scan += tt2 - tt1;
}
TIC;
// labels is in-place for HeapHC
handler->to_flat_arrays(
distances, labels, skip & 16 ? nullptr : normalizers.get());
TIC;
// these stats are not thread-safe
for (int i = 1; i < ti; i++) {
IVFFastScan_stats.times[i] += times[i] - times[i - 1];
}
IVFFastScan_stats.t_copy_pack += t_copy_pack;
IVFFastScan_stats.t_scan += t_scan;
if (auto* rh = dynamic_cast<ReservoirHC*>(handler.get())) {
for (int i = 0; i < 4; i++) {
IVFFastScan_stats.reservoir_times[i] += rh->times[i];
}
}
*ndis_out = ndis;
*nlist_out = nlist;
}
IVFFastScanStats IVFFastScan_stats;
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <memory>
#include <faiss/IndexIVFPQ.h>
#include <faiss/impl/ProductQuantizer.h>
#include <faiss/utils/AlignedTable.h>
namespace faiss {
/** Fast scan version of IVFPQ. Works for 4-bit PQ for now.
*
* The codes in the inverted lists are not stored sequentially but
* grouped in blocks of size bbs. This makes it possible to very quickly
* compute distances with SIMD instructions.
*
* Implementations (implem):
* 0: auto-select implementation (default)
* 1: orig's search, re-implemented
* 2: orig's search, re-ordered by invlist
* 10: optimized int16 search, collect results in heap, no qbs
* 11: idem, collect results in reservoir
* 12: optimized int16 search, collect results in heap, uses qbs
* 13: idem, collect results in reservoir
*/
struct IndexIVFPQFastScan : IndexIVF {
bool by_residual; ///< Encode residual or plain vector?
ProductQuantizer pq; ///< produces the codes
// size of the kernel
int bbs; // set at build time
// M rounded up to a multiple of 2
size_t M2;
/// precomputed tables management
int use_precomputed_table = 0;
/// if use_precomputed_table, size (nlist, pq.M, pq.ksub)
AlignedTable<float> precomputed_table;
// search-time implementation
int implem = 0;
// skip some parts of the computation (for timing)
int skip = 0;
// batching factors at search time (0 = default)
int qbs = 0;
size_t qbs2 = 0;
IndexIVFPQFastScan(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
MetricType metric = METRIC_L2,
int bbs = 32);
IndexIVFPQFastScan();
// built from an IndexIVFPQ
explicit IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs = 32);
/// orig's inverted lists (for debugging)
InvertedLists* orig_invlists = nullptr;
void train_residual(idx_t n, const float* x) override;
/// build precomputed table, possibly updating use_precomputed_table
void precompute_table();
/// same as the regular IVFPQ encoder. The codes are not reorganized by
/// blocks at that point
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listno = false) const override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
// prepare look-up tables
void compute_LUT(
size_t n,
const float* x,
const idx_t* coarse_ids,
const float* coarse_dis,
AlignedTable<float>& dis_tables,
AlignedTable<float>& biases) const;
void compute_LUT_uint8(
size_t n,
const float* x,
const idx_t* coarse_ids,
const float* coarse_dis,
AlignedTable<uint8_t>& dis_tables,
AlignedTable<uint16_t>& biases,
float* normalizers) const;
// internal search funcs
template <bool is_max>
void search_dispatch_implem(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const;
template <class C>
void search_implem_1(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const;
template <class C>
void search_implem_2(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const;
// implem 10 and 12 are not multithreaded internally, so
// export search stats
template <class C>
void search_implem_10(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
int impl,
size_t* ndis_out,
size_t* nlist_out) const;
template <class C>
void search_implem_12(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels,
int impl,
size_t* ndis_out,
size_t* nlist_out) const;
};
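// A hedged usage sketch (illustrative; the dataset names, nlist and M are
// assumptions): the index requires 4-bit PQ codes and can also be converted
// from an existing IndexIVFPQ whose pq.nbits == 4.
//
//   faiss::IndexFlatL2 coarse(d);
//   faiss::IndexIVFPQFastScan index(&coarse, d, nlist, /*M*/ d / 2, /*nbits*/ 4);
//   index.train(nt, xt);
//   index.add(nb, xb);
//   index.nprobe = 16;
//   index.implem = 0; // let search_dispatch_implem pick implem 10-13
//   index.search(nq, xq, k, D, I);
//
//   // or, converting an existing 4-bit IndexIVFPQ `ivfpq`:
//   // faiss::IndexIVFPQFastScan fast(ivfpq, /*bbs*/ 32);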
struct IVFFastScanStats {
uint64_t times[10];
uint64_t t_compute_distance_tables, t_round;
uint64_t t_copy_pack, t_scan, t_to_flat;
uint64_t reservoir_times[4];
double Mcy_at(int i) {
return times[i] / (1000 * 1000.0);
}
double Mcy_reservoir_at(int i) {
return reservoir_times[i] / (1000 * 1000.0);
}
IVFFastScanStats() {
reset();
}
void reset() {
memset(this, 0, sizeof(*this));
}
};
FAISS_API extern IVFFastScanStats IVFFastScan_stats;
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFPQR.h>
#include <cinttypes>
#include <faiss/utils/Heap.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/utils.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/*****************************************
* IndexIVFPQR implementation
******************************************/
IndexIVFPQR::IndexIVFPQR(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
size_t M_refine,
size_t nbits_per_idx_refine)
: IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx),
refine_pq(d, M_refine, nbits_per_idx_refine),
k_factor(4) {
by_residual = true;
}
IndexIVFPQR::IndexIVFPQR() : k_factor(1) {
by_residual = true;
}
void IndexIVFPQR::reset() {
IndexIVFPQ::reset();
refine_codes.clear();
}
void IndexIVFPQR::train_residual(idx_t n, const float* x) {
float* residual_2 = new float[n * d];
ScopeDeleter<float> del(residual_2);
train_residual_o(n, x, residual_2);
if (verbose)
printf("training %zdx%zd 2nd level PQ quantizer on %" PRId64
" %dD-vectors\n",
refine_pq.M,
refine_pq.ksub,
n,
d);
refine_pq.cp.max_points_per_centroid = 1000;
refine_pq.cp.verbose = verbose;
refine_pq.train(n, residual_2);
}
void IndexIVFPQR::add_with_ids(idx_t n, const float* x, const idx_t* xids) {
add_core(n, x, xids, nullptr);
}
void IndexIVFPQR::add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx) {
float* residual_2 = new float[n * d];
ScopeDeleter<float> del(residual_2);
idx_t n0 = ntotal;
add_core_o(n, x, xids, residual_2, precomputed_idx);
refine_codes.resize(ntotal * refine_pq.code_size);
refine_pq.compute_codes(
residual_2, &refine_codes[n0 * refine_pq.code_size], n);
}
#define TIC t0 = get_cycles()
#define TOC get_cycles() - t0
void IndexIVFPQR::search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* idx,
const float* L1_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params,
IndexIVFStats* stats) const {
uint64_t t0;
TIC;
size_t k_coarse = long(k * k_factor);
idx_t* coarse_labels = new idx_t[k_coarse * n];
ScopeDeleter<idx_t> del1(coarse_labels);
{ // query with quantizer levels 1 and 2.
float* coarse_distances = new float[k_coarse * n];
ScopeDeleter<float> del(coarse_distances);
IndexIVFPQ::search_preassigned(
n,
x,
k_coarse,
idx,
L1_dis,
coarse_distances,
coarse_labels,
true,
params);
}
indexIVFPQ_stats.search_cycles += TOC;
TIC;
// 3rd level refinement
size_t n_refine = 0;
#pragma omp parallel reduction(+ : n_refine)
{
// tmp buffers
float* residual_1 = new float[2 * d];
ScopeDeleter<float> del(residual_1);
float* residual_2 = residual_1 + d;
#pragma omp for
for (idx_t i = 0; i < n; i++) {
const float* xq = x + i * d;
const idx_t* shortlist = coarse_labels + k_coarse * i;
float* heap_sim = distances + k * i;
idx_t* heap_ids = labels + k * i;
maxheap_heapify(k, heap_sim, heap_ids);
for (int j = 0; j < k_coarse; j++) {
idx_t sl = shortlist[j];
if (sl == -1)
continue;
int list_no = lo_listno(sl);
int ofs = lo_offset(sl);
assert(list_no >= 0 && list_no < nlist);
assert(ofs >= 0 && ofs < invlists->list_size(list_no));
// 1st level residual
quantizer->compute_residual(xq, residual_1, list_no);
// 2nd level residual
const uint8_t* l2code = invlists->get_single_code(list_no, ofs);
pq.decode(l2code, residual_2);
for (int l = 0; l < d; l++)
residual_2[l] = residual_1[l] - residual_2[l];
// 3rd level residual's approximation
idx_t id = invlists->get_single_id(list_no, ofs);
assert(0 <= id && id < ntotal);
refine_pq.decode(
&refine_codes[id * refine_pq.code_size], residual_1);
float dis = fvec_L2sqr(residual_1, residual_2, d);
if (dis < heap_sim[0]) {
idx_t id_or_pair = store_pairs ? sl : id;
maxheap_replace_top(k, heap_sim, heap_ids, dis, id_or_pair);
}
n_refine++;
}
maxheap_reorder(k, heap_sim, heap_ids);
}
}
indexIVFPQ_stats.nrefine += n_refine;
indexIVFPQ_stats.refine_cycles += TOC;
}
void IndexIVFPQR::reconstruct_from_offset(
int64_t list_no,
int64_t offset,
float* recons) const {
IndexIVFPQ::reconstruct_from_offset(list_no, offset, recons);
idx_t id = invlists->get_single_id(list_no, offset);
assert(0 <= id && id < ntotal);
std::vector<float> r3(d);
refine_pq.decode(&refine_codes[id * refine_pq.code_size], r3.data());
for (int i = 0; i < d; ++i) {
recons[i] += r3[i];
}
}
void IndexIVFPQR::merge_from(IndexIVF& other_in, idx_t add_id) {
IndexIVFPQR* other = dynamic_cast<IndexIVFPQR*>(&other_in);
FAISS_THROW_IF_NOT(other);
IndexIVF::merge_from(other_in, add_id);
refine_codes.insert(
refine_codes.end(),
other->refine_codes.begin(),
other->refine_codes.end());
other->refine_codes.clear();
}
size_t IndexIVFPQR::remove_ids(const IDSelector& /*sel*/) {
FAISS_THROW_MSG("not implemented");
return 0;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <vector>
#include <faiss/IndexIVFPQ.h>
namespace faiss {
/** Index with an additional level of PQ refinement */
struct IndexIVFPQR : IndexIVFPQ {
ProductQuantizer refine_pq; ///< 3rd level quantizer
std::vector<uint8_t> refine_codes; ///< corresponding codes
/// factor between k requested in search and the k requested from the IVFPQ
float k_factor;
IndexIVFPQR(
Index* quantizer,
size_t d,
size_t nlist,
size_t M,
size_t nbits_per_idx,
size_t M_refine,
size_t nbits_per_idx_refine);
void reset() override;
size_t remove_ids(const IDSelector& sel) override;
/// trains the two product quantizers
void train_residual(idx_t n, const float* x) override;
void add_with_ids(idx_t n, const float* x, const idx_t* xids) override;
/// same as add_with_ids, but optionally use the precomputed list ids
void add_core(
idx_t n,
const float* x,
const idx_t* xids,
const idx_t* precomputed_idx) override;
void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons)
const override;
void merge_from(IndexIVF& other, idx_t add_id) override;
void search_preassigned(
idx_t n,
const float* x,
idx_t k,
const idx_t* assign,
const float* centroid_dis,
float* distances,
idx_t* labels,
bool store_pairs,
const IVFSearchParameters* params = nullptr,
IndexIVFStats* stats = nullptr) const override;
IndexIVFPQR();
};
} // namespace faiss
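The declaration above follows the standard faiss Index API, so an IndexIVFPQR is built like any other IVF index; k_factor only changes how many candidates the IVFPQ stage returns before re-ranking. The sketch below is illustrative only: example_ivfpqr is a hypothetical helper, the parameter values (1024 lists, 16 + 16 code bytes per vector, nprobe, k_factor) are arbitrary, and it assumes the header above is installed as <faiss/IndexIVFPQR.h> as in upstream faiss.
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFPQR.h>

void example_ivfpqr(size_t d, size_t nt, const float* xt,
                    size_t nb, const float* xb,
                    size_t nq, const float* xq) {
    // d must be a multiple of the number of PQ sub-quantizers (16 here)
    faiss::IndexFlatL2 coarse_quantizer(d);
    // 1024 lists, 16 sub-quantizers of 8 bits, plus 16 refinement bytes
    faiss::IndexIVFPQR index(&coarse_quantizer, d, 1024, 16, 8, 16, 8);
    index.train(nt, xt);          // trains the coarse quantizer and both PQs
    index.add(nb, xb);
    index.nprobe = 32;            // inverted lists visited per query
    index.k_factor = 4;           // re-rank 4*k IVFPQ candidates
    const faiss::Index::idx_t k = 10;
    std::vector<float> distances(nq * k);
    std::vector<faiss::Index::idx_t> labels(nq * k);
    index.search(nq, xq, k, distances.data(), labels.data());
}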
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexIVFSpectralHash.h>
#include <stdint.h>
#include <algorithm>
#include <memory>
#include <faiss/IndexLSH.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/VectorTransform.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/hamming.h>
#include <faiss/utils/utils.h>
namespace faiss {
IndexIVFSpectralHash::IndexIVFSpectralHash(
Index* quantizer,
size_t d,
size_t nlist,
int nbit,
float period)
: IndexIVF(quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2),
nbit(nbit),
period(period),
threshold_type(Thresh_global) {
RandomRotationMatrix* rr = new RandomRotationMatrix(d, nbit);
rr->init(1234);
vt = rr;
own_fields = true;
is_trained = false;
}
IndexIVFSpectralHash::IndexIVFSpectralHash()
: IndexIVF(),
vt(nullptr),
own_fields(false),
nbit(0),
period(0),
threshold_type(Thresh_global) {}
IndexIVFSpectralHash::~IndexIVFSpectralHash() {
if (own_fields) {
delete vt;
}
}
namespace {
float median(size_t n, float* x) {
std::sort(x, x + n);
if (n % 2 == 1) {
return x[n / 2];
} else {
return (x[n / 2 - 1] + x[n / 2]) / 2;
}
}
} // namespace
void IndexIVFSpectralHash::train_residual(idx_t n, const float* x) {
if (!vt->is_trained) {
vt->train(n, x);
}
if (threshold_type == Thresh_global) {
// nothing to do
return;
} else if (
threshold_type == Thresh_centroid ||
threshold_type == Thresh_centroid_half) {
// convert all centroids with vt
std::vector<float> centroids(nlist * d);
quantizer->reconstruct_n(0, nlist, centroids.data());
trained.resize(nlist * nbit);
vt->apply_noalloc(nlist, centroids.data(), trained.data());
if (threshold_type == Thresh_centroid_half) {
for (size_t i = 0; i < nlist * nbit; i++) {
trained[i] -= 0.25 * period;
}
}
return;
}
// otherwise train medians
// assign
std::unique_ptr<idx_t[]> idx(new idx_t[n]);
quantizer->assign(n, x, idx.get());
std::vector<size_t> sizes(nlist + 1);
for (size_t i = 0; i < n; i++) {
FAISS_THROW_IF_NOT(idx[i] >= 0);
sizes[idx[i]]++;
}
size_t ofs = 0;
for (int j = 0; j < nlist; j++) {
size_t o0 = ofs;
ofs += sizes[j];
sizes[j] = o0;
}
// transform
std::unique_ptr<float[]> xt(vt->apply(n, x));
// transpose + reorder
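// xo stores, for each inverted list and each bit j, the j-th transformed
// coordinate of all training vectors assigned to that list as one
// contiguous span, so each per-(list, bit) median below can be computed
// in place on a contiguous range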
std::unique_ptr<float[]> xo(new float[n * nbit]);
for (size_t i = 0; i < n; i++) {
size_t idest = sizes[idx[i]]++;
for (size_t j = 0; j < nbit; j++) {
xo[idest + n * j] = xt[i * nbit + j];
}
}
trained.resize(nlist * nbit);
// compute medians
#pragma omp parallel for
for (int i = 0; i < nlist; i++) {
size_t i0 = i == 0 ? 0 : sizes[i - 1];
size_t i1 = sizes[i];
for (int j = 0; j < nbit; j++) {
float* xoi = xo.get() + i0 + n * j;
if (i0 == i1) { // nothing to train
trained[i * nbit + j] = 0.0;
} else if (i1 == i0 + 1) {
trained[i * nbit + j] = xoi[0];
} else {
trained[i * nbit + j] = median(i1 - i0, xoi);
}
}
}
}
namespace {
void binarize_with_freq(
size_t nbit,
float freq,
const float* x,
const float* c,
uint8_t* codes) {
memset(codes, 0, (nbit + 7) / 8);
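// freq = 2 / period, so floor((x - c) * freq) increases by 1 every
// period/2: successive half-period intervals of each centered coordinate
// alternate between bit 0 and bit 1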
for (size_t i = 0; i < nbit; i++) {
float xf = (x[i] - c[i]);
int64_t xi = int64_t(floor(xf * freq));
int64_t bit = xi & 1;
codes[i >> 3] |= bit << (i & 7);
}
}
} // namespace
void IndexIVFSpectralHash::encode_vectors(
idx_t n,
const float* x_in,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos) const {
FAISS_THROW_IF_NOT(is_trained);
float freq = 2.0 / period;
size_t coarse_size = include_listnos ? coarse_code_size() : 0;
// transform with vt
std::unique_ptr<float[]> x(vt->apply(n, x_in));
std::vector<float> zero(nbit);
#pragma omp parallel for
for (idx_t i = 0; i < n; i++) {
int64_t list_no = list_nos[i];
uint8_t* code = codes + i * (code_size + coarse_size);
if (list_no >= 0) {
if (coarse_size) {
encode_listno(list_no, code);
}
const float* c;
if (threshold_type == Thresh_global) {
c = zero.data();
} else {
c = trained.data() + list_no * nbit;
}
binarize_with_freq(
nbit, freq, x.get() + i * nbit, c, code + coarse_size);
} else {
memset(code, 0, code_size + coarse_size);
}
}
}
namespace {
template <class HammingComputer>
struct IVFScanner : InvertedListScanner {
// copied from index structure
const IndexIVFSpectralHash* index;
size_t nbit;
float period, freq;
std::vector<float> q;
std::vector<float> zero;
std::vector<uint8_t> qcode;
HammingComputer hc;
using idx_t = Index::idx_t;
IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs)
: index(index),
nbit(index->nbit),
period(index->period),
freq(2.0 / index->period),
q(nbit),
zero(nbit),
qcode(index->code_size),
hc(qcode.data(), index->code_size) {
this->store_pairs = store_pairs;
this->code_size = index->code_size;
}
void set_query(const float* query) override {
FAISS_THROW_IF_NOT(query);
FAISS_THROW_IF_NOT(q.size() == nbit);
index->vt->apply_noalloc(1, query, q.data());
if (index->threshold_type == IndexIVFSpectralHash::Thresh_global) {
binarize_with_freq(nbit, freq, q.data(), zero.data(), qcode.data());
hc.set(qcode.data(), code_size);
}
}
void set_list(idx_t list_no, float /*coarse_dis*/) override {
this->list_no = list_no;
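// with per-list thresholds (centroid or median variants) the query must be
// re-binarized against this list's thresholds before Hamming comparisons;
// for Thresh_global the code computed in set_query is reused unchanged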
if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) {
const float* c = index->trained.data() + list_no * nbit;
binarize_with_freq(nbit, freq, q.data(), c, qcode.data());
hc.set(qcode.data(), code_size);
}
}
float distance_to_code(const uint8_t* code) const final {
return hc.hamming(code);
}
size_t scan_codes(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float* simi,
idx_t* idxi,
size_t k) const override {
size_t nup = 0;
for (size_t j = 0; j < list_size; j++) {
float dis = hc.hamming(codes);
if (dis < simi[0]) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
maxheap_replace_top(k, simi, idxi, dis, id);
nup++;
}
codes += code_size;
}
return nup;
}
void scan_codes_range(
size_t list_size,
const uint8_t* codes,
const idx_t* ids,
float radius,
RangeQueryResult& res) const override {
for (size_t j = 0; j < list_size; j++) {
float dis = hc.hamming(codes);
if (dis < radius) {
int64_t id = store_pairs ? lo_build(list_no, j) : ids[j];
res.add(dis, id);
}
codes += code_size;
}
}
};
} // anonymous namespace
InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner(
bool store_pairs) const {
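// use a HammingComputer specialized for the exact code size when one is
// available; otherwise fall back to the generic HammingComputerDefault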
switch (code_size) {
#define HANDLE_CODE_SIZE(cs) \
case cs: \
return new IVFScanner<HammingComputer##cs>(this, store_pairs)
HANDLE_CODE_SIZE(4);
HANDLE_CODE_SIZE(8);
HANDLE_CODE_SIZE(16);
HANDLE_CODE_SIZE(20);
HANDLE_CODE_SIZE(32);
HANDLE_CODE_SIZE(64);
#undef HANDLE_CODE_SIZE
default:
return new IVFScanner<HammingComputerDefault>(this, store_pairs);
}
}
void IndexIVFSpectralHash::replace_vt(VectorTransform* vt_in, bool own) {
FAISS_THROW_IF_NOT(vt_in->d_out == nbit);
FAISS_THROW_IF_NOT(vt_in->d_in == d);
if (own_fields) {
delete vt;
}
vt = vt_in;
threshold_type = Thresh_global;
is_trained = quantizer->is_trained && quantizer->ntotal == nlist &&
vt->is_trained;
own_fields = own;
}
/*
Check that the encoder is a single vector transform followed by a LSH
that just does thresholding.
If this is not the case, the linear transform + thresholds of the IndexLSH
should be merged into the VectorTransform (which is feasible).
*/
void IndexIVFSpectralHash::replace_vt(IndexPreTransform* encoder, bool own) {
FAISS_THROW_IF_NOT(encoder->chain.size() == 1);
auto sub_index = dynamic_cast<IndexLSH*>(encoder->index);
FAISS_THROW_IF_NOT_MSG(sub_index, "final index should be LSH");
FAISS_THROW_IF_NOT(sub_index->nbits == nbit);
FAISS_THROW_IF_NOT(!sub_index->rotate_data);
FAISS_THROW_IF_NOT(!sub_index->train_thresholds);
replace_vt(encoder->chain[0], own);
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_INDEX_IVFSH_H
#define FAISS_INDEX_IVFSH_H
#include <vector>
#include <faiss/IndexIVF.h>
namespace faiss {
struct VectorTransform;
struct IndexPreTransform;
/** Inverted-list index that stores binary codes of size nbit. Before the
* binary conversion, vectors are transformed from dimension d to dimension
* nbit by vt (a random rotation by default).
*
* A per-coordinate threshold (determined by threshold_type) is subtracted
* from each coordinate, and the result is split into intervals of size
* period. One half of each interval maps to a 0 bit, the other half to a
* 1 bit.
*/
struct IndexIVFSpectralHash : IndexIVF {
/// transformation from d to nbit dim
VectorTransform* vt;
/// own the vt
bool own_fields;
/// nb of bits of the binary signature
int nbit;
/// interval size for 0s and 1s
float period;
enum ThresholdType {
Thresh_global, ///< global threshold at 0
Thresh_centroid, ///< compare to centroid
Thresh_centroid_half, ///< central interval around centroid
Thresh_median ///< median of training set
};
ThresholdType threshold_type;
/// Trained thresholds.
/// size nlist * nbit or 0 if Thresh_global
std::vector<float> trained;
IndexIVFSpectralHash(
Index* quantizer,
size_t d,
size_t nlist,
int nbit,
float period);
IndexIVFSpectralHash();
void train_residual(idx_t n, const float* x) override;
void encode_vectors(
idx_t n,
const float* x,
const idx_t* list_nos,
uint8_t* codes,
bool include_listnos = false) const override;
InvertedListScanner* get_InvertedListScanner(
bool store_pairs) const override;
/** replace the vector transform for an empty (and possibly untrained) index
*/
void replace_vt(VectorTransform* vt, bool own = false);
/** convenience function to get the VT from an index constructed by an
* index_factory (should end in "LSH") */
void replace_vt(IndexPreTransform* index, bool own = false);
~IndexIVFSpectralHash() override;
};
} // namespace faiss
#endif
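For reference, the header above is used like any other IndexIVF subclass. The sketch below is illustrative only: example_ivf_spectral_hash is a hypothetical helper, and the nlist/nbit/period/nprobe values are arbitrary; it relies on the standard train/add/search API and the threshold_type field declared above.
#include <vector>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFSpectralHash.h>

void example_ivf_spectral_hash(size_t d, size_t nt, const float* xt,
                               size_t nb, const float* xb,
                               size_t nq, const float* xq) {
    faiss::IndexFlatL2 coarse_quantizer(d);
    // 256 lists, 64-bit binary signatures, period 10 on the rotated data
    faiss::IndexIVFSpectralHash index(&coarse_quantizer, d, 256, 64, 10.0f);
    index.threshold_type = faiss::IndexIVFSpectralHash::Thresh_centroid;
    index.train(nt, xt);   // trains the coarse quantizer, vt and thresholds
    index.add(nb, xb);
    index.nprobe = 16;
    const faiss::Index::idx_t k = 10;
    std::vector<float> distances(nq * k);   // Hamming distances, as floats
    std::vector<faiss::Index::idx_t> labels(nq * k);
    index.search(nq, xq, k, distances.data(), labels.data());
}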