"tools/vscode:/vscode.git/clone" did not exist on "c65f3308900568cf48cc2ba891e7c52679471304"
Commit 395d2ce6 authored by huchen's avatar huchen
Browse files

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/IndexShards.h>
#include <cinttypes>
#include <cstdio>
#include <functional>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/WorkerThread.h>
namespace faiss {
// subroutines
namespace {
typedef Index::idx_t idx_t;
// add translation to all valid labels
void translate_labels(long n, idx_t* labels, long translation) {
if (translation == 0)
return;
for (long i = 0; i < n; i++) {
if (labels[i] < 0)
continue;
labels[i] += translation;
}
}
/** merge result tables from several shards.
* @param all_distances size nshard * n * k
* @param all_labels idem
* @param translations label translations to apply, size nshard
*/
template <class IndexClass, class C>
void merge_tables(
long n,
long k,
long nshard,
typename IndexClass::distance_t* distances,
idx_t* labels,
const std::vector<typename IndexClass::distance_t>& all_distances,
const std::vector<idx_t>& all_labels,
const std::vector<long>& translations) {
if (k == 0) {
return;
}
using distance_t = typename IndexClass::distance_t;
long stride = n * k;
#pragma omp parallel
{
std::vector<int> buf(2 * nshard);
int* pointer = buf.data();
int* shard_ids = pointer + nshard;
std::vector<distance_t> buf2(nshard);
distance_t* heap_vals = buf2.data();
#pragma omp for
for (long i = 0; i < n; i++) {
// the heap maps values to the shard where they are
// produced.
const distance_t* D_in = all_distances.data() + i * k;
const idx_t* I_in = all_labels.data() + i * k;
int heap_size = 0;
for (long s = 0; s < nshard; s++) {
pointer[s] = 0;
if (I_in[stride * s] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s],
s);
}
}
distance_t* D = distances + i * k;
idx_t* I = labels + i * k;
for (int j = 0; j < k; j++) {
if (heap_size == 0) {
I[j] = -1;
D[j] = C::neutral();
} else {
// pop best element
int s = shard_ids[0];
int& p = pointer[s];
D[j] = heap_vals[0];
I[j] = I_in[stride * s + p] + translations[s];
heap_pop<C>(heap_size--, heap_vals, shard_ids);
p++;
if (p < k && I_in[stride * s + p] >= 0) {
heap_push<C>(
++heap_size,
heap_vals,
shard_ids,
D_in[stride * s + p],
s);
}
}
}
}
}
}
} // anonymous namespace
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(
idx_t d,
bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(
int d,
bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(d, threaded), successive_ids(successive_ids) {}
template <typename IndexT>
IndexShardsTemplate<IndexT>::IndexShardsTemplate(
bool threaded,
bool successive_ids)
: ThreadedIndex<IndexT>(threaded), successive_ids(successive_ids) {}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::onAfterAddIndex(IndexT* index /* unused */) {
syncWithSubIndexes();
}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::onAfterRemoveIndex(
IndexT* index /* unused */) {
syncWithSubIndexes();
}
// FIXME: assumes that nothing is currently running on the sub-indexes, which is
// true with the normal API, but should use the runOnIndex API instead
template <typename IndexT>
void IndexShardsTemplate<IndexT>::syncWithSubIndexes() {
if (!this->count()) {
this->is_trained = false;
this->ntotal = 0;
return;
}
auto firstIndex = this->at(0);
this->metric_type = firstIndex->metric_type;
this->is_trained = firstIndex->is_trained;
this->ntotal = firstIndex->ntotal;
for (int i = 1; i < this->count(); ++i) {
auto index = this->at(i);
FAISS_THROW_IF_NOT(this->metric_type == index->metric_type);
FAISS_THROW_IF_NOT(this->d == index->d);
FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
this->ntotal += index->ntotal;
}
}
// No metric_type for IndexBinary
template <>
void IndexShardsTemplate<IndexBinary>::syncWithSubIndexes() {
if (!this->count()) {
this->is_trained = false;
this->ntotal = 0;
return;
}
auto firstIndex = this->at(0);
this->is_trained = firstIndex->is_trained;
this->ntotal = firstIndex->ntotal;
for (int i = 1; i < this->count(); ++i) {
auto index = this->at(i);
FAISS_THROW_IF_NOT(this->d == index->d);
FAISS_THROW_IF_NOT(this->is_trained == index->is_trained);
this->ntotal += index->ntotal;
}
}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::train(idx_t n, const component_t* x) {
auto fn = [n, x](int no, IndexT* index) {
if (index->verbose) {
printf("begin train shard %d on %" PRId64 " points\n", no, n);
}
index->train(n, x);
if (index->verbose) {
printf("end train shard %d\n", no);
}
};
this->runOnIndex(fn);
syncWithSubIndexes();
}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::add(idx_t n, const component_t* x) {
add_with_ids(n, x, nullptr);
}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::add_with_ids(
idx_t n,
const component_t* x,
const idx_t* xids) {
FAISS_THROW_IF_NOT_MSG(
!(successive_ids && xids),
"It makes no sense to pass in ids and "
"request them to be shifted");
if (successive_ids) {
FAISS_THROW_IF_NOT_MSG(
!xids,
"It makes no sense to pass in ids and "
"request them to be shifted");
FAISS_THROW_IF_NOT_MSG(
this->ntotal == 0,
"when adding to IndexShards with sucessive_ids, "
"only add() in a single pass is supported");
}
idx_t nshard = this->count();
const idx_t* ids = xids;
std::vector<idx_t> aids;
if (!ids && !successive_ids) {
aids.resize(n);
for (idx_t i = 0; i < n; i++) {
aids[i] = this->ntotal + i;
}
ids = aids.data();
}
size_t components_per_vec =
sizeof(component_t) == 1 ? (this->d + 7) / 8 : this->d;
auto fn = [n, ids, x, nshard, components_per_vec](int no, IndexT* index) {
idx_t i0 = (idx_t)no * n / nshard;
idx_t i1 = ((idx_t)no + 1) * n / nshard;
auto x0 = x + i0 * components_per_vec;
if (index->verbose) {
printf("begin add shard %d on %" PRId64 " points\n", no, n);
}
if (ids) {
index->add_with_ids(i1 - i0, x0, ids + i0);
} else {
index->add(i1 - i0, x0);
}
if (index->verbose) {
printf("end add shard %d on %" PRId64 " points\n", no, i1 - i0);
}
};
this->runOnIndex(fn);
syncWithSubIndexes();
}
template <typename IndexT>
void IndexShardsTemplate<IndexT>::search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
long nshard = this->count();
std::vector<distance_t> all_distances(nshard * k * n);
std::vector<idx_t> all_labels(nshard * k * n);
auto fn = [n, k, x, &all_distances, &all_labels](
int no, const IndexT* index) {
if (index->verbose) {
printf("begin query shard %d on %" PRId64 " points\n", no, n);
}
index->search(
n,
x,
k,
all_distances.data() + no * k * n,
all_labels.data() + no * k * n);
if (index->verbose) {
printf("end query shard %d\n", no);
}
};
this->runOnIndex(fn);
std::vector<long> translations(nshard, 0);
// Because we just called runOnIndex above, it is safe to access the
// sub-index ntotal here
if (successive_ids) {
translations[0] = 0;
for (int s = 0; s + 1 < nshard; s++) {
translations[s + 1] = translations[s] + this->at(s)->ntotal;
}
}
if (this->metric_type == METRIC_L2) {
merge_tables<IndexT, CMin<distance_t, int>>(
n,
k,
nshard,
distances,
labels,
all_distances,
all_labels,
translations);
} else {
merge_tables<IndexT, CMax<distance_t, int>>(
n,
k,
nshard,
distances,
labels,
all_distances,
all_labels,
translations);
}
}
// explicit instantiations
template struct IndexShardsTemplate<Index>;
template struct IndexShardsTemplate<IndexBinary>;
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/IndexBinary.h>
#include <faiss/impl/ThreadedIndex.h>
namespace faiss {
/**
* Index that concatenates the results from several sub-indexes
*/
template <typename IndexT>
struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
/**
* The dimension that all sub-indices must share will be the dimension of
* the first sub-index added
*
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShardsTemplate(
bool threaded = false,
bool successive_ids = true);
/**
* @param threaded do we use one thread per sub_index or do
* queries sequentially?
* @param successive_ids should we shift the returned ids by
* the size of each sub-index or return them
* as they are?
*/
explicit IndexShardsTemplate(
idx_t d,
bool threaded = false,
bool successive_ids = true);
/// int version, to avoid the ambiguity of an int dimension being
/// implicitly converted to bool
explicit IndexShardsTemplate(
int d,
bool threaded = false,
bool successive_ids = true);
/// Alias for addIndex()
void add_shard(IndexT* index) {
this->addIndex(index);
}
/// Alias for removeIndex()
void remove_shard(IndexT* index) {
this->removeIndex(index);
}
/// supported only for sub-indices that implement add_with_ids
void add(idx_t n, const component_t* x) override;
/**
* Cases (successive_ids, xids):
* - true, non-NULL ERROR: it makes no sense to pass in ids and
* request them to be shifted
* - true, NULL OK, but should be called only once (calls add()
* on sub-indexes).
* - false, non-NULL OK: will call add_with_ids with passed in xids
* distributed evenly over shards
* - false, NULL OK: will call add_with_ids on each sub-index,
* starting at ntotal
*/
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
void search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
bool successive_ids;
/// Synchronize the top-level index (IndexShards) with data in the
/// sub-indices
void syncWithSubIndexes();
protected:
/// Called just after an index is added
void onAfterAddIndex(IndexT* index) override;
/// Called just after an index is removed
void onAfterRemoveIndex(IndexT* index) override;
};
using IndexShards = IndexShardsTemplate<Index>;
using IndexBinaryShards = IndexShardsTemplate<IndexBinary>;
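/* Usage sketch: query several pre-built sub-indexes as one index. The
 * sub-index type (IndexFlatL2), the dimension and the array names
 * (xb, xq, D, I) below are illustrative assumptions.
 *
 *   faiss::IndexFlatL2 shard0(64), shard1(64);
 *   faiss::IndexShards shards(64, true, true); // threaded, successive_ids
 *   shards.add_shard(&shard0);
 *   shards.add_shard(&shard1);
 *   shards.add(nb, xb);              // split evenly over the shards
 *   shards.search(nq, xq, 10, D, I); // merged, id-translated results
 */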
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/MatrixStats.h>
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */
#include <faiss/utils/utils.h>
#include <cmath>
#include <cstdio>
namespace faiss {
/*********************************************************************
* MatrixStats
*********************************************************************/
MatrixStats::PerDimStats::PerDimStats()
: n(0),
n_nan(0),
n_inf(0),
n0(0),
min(HUGE_VALF),
max(-HUGE_VALF),
sum(0),
sum2(0),
mean(NAN),
stddev(NAN) {}
void MatrixStats::PerDimStats::add(float x) {
n++;
if (std::isnan(x)) {
n_nan++;
return;
}
if (!std::isfinite(x)) {
n_inf++;
return;
}
if (x == 0)
n0++;
if (x < min)
min = x;
if (x > max)
max = x;
sum += x;
sum2 += (double)x * (double)x;
}
void MatrixStats::PerDimStats::compute_mean_std() {
n_valid = n - n_nan - n_inf;
mean = sum / n_valid;
double var = sum2 / n_valid - mean * mean;
if (var < 0)
var = 0;
stddev = sqrt(var);
}
void MatrixStats::do_comment(const char* fmt, ...) {
va_list ap;
/* Determine required size */
va_start(ap, fmt);
size_t size = vsnprintf(buf, nbuf, fmt, ap);
va_end(ap);
nbuf -= size;
buf += size;
}
MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
: n(n),
d(d),
n_collision(0),
n_valid(0),
n0(0),
min_norm2(HUGE_VAL),
max_norm2(0) {
std::vector<char> comment_buf(10000);
buf = comment_buf.data();
nbuf = comment_buf.size();
do_comment("analyzing %ld vectors of size %ld\n", n, d);
if (d > 1024) {
do_comment(
"indexing this many dimensions is hard, "
"please consider dimensionality reducution (with PCAMatrix)\n");
}
size_t nbytes = sizeof(x[0]) * d;
per_dim_stats.resize(d);
for (size_t i = 0; i < n; i++) {
const float* xi = x + d * i;
double sum2 = 0;
for (size_t j = 0; j < d; j++) {
per_dim_stats[j].add(xi[j]);
sum2 += xi[j] * (double)xi[j];
}
if (std::isfinite(sum2)) {
n_valid++;
if (sum2 == 0) {
n0++;
} else {
if (sum2 < min_norm2)
min_norm2 = sum2;
if (sum2 > max_norm2)
max_norm2 = sum2;
}
}
{ // check hash
uint64_t hash = hash_bytes((const uint8_t*)xi, nbytes);
auto elt = occurrences.find(hash);
if (elt == occurrences.end()) {
Occurrence occ = {i, 1};
occurrences[hash] = occ;
} else {
if (!memcmp(xi, x + elt->second.first * d, nbytes)) {
elt->second.count++;
} else {
n_collision++;
// a list of colliding vectors would be more precise, but overkill here
}
}
}
}
// invalid vector stats
if (n_valid == n) {
do_comment("no NaN or Infs in data\n");
} else {
do_comment(
"%ld vectors contain NaN or Inf "
"(or have too large components), "
"expect bad results with indexing!\n",
n - n_valid);
}
// copies in dataset
if (occurrences.size() == n) {
do_comment("all vectors are distinct\n");
} else {
do_comment(
"%ld vectors are distinct (%.2f%%)\n",
occurrences.size(),
occurrences.size() * 100.0 / n);
if (n_collision > 0) {
do_comment(
"%ld collisions in hash table, "
"counts may be invalid\n",
n_collision);
}
Occurrence max = {0, 0};
for (auto it = occurrences.begin(); it != occurrences.end(); ++it) {
if (it->second.count > max.count) {
max = it->second;
}
}
do_comment("vector %ld has %ld copies\n", max.first, max.count);
}
{ // norm stats
min_norm2 = sqrt(min_norm2);
max_norm2 = sqrt(max_norm2);
do_comment(
"range of L2 norms=[%g, %g] (%ld null vectors)\n",
min_norm2,
max_norm2,
n0);
if (max_norm2 < min_norm2 * 1.0001) {
do_comment(
"vectors are normalized, inner product and "
"L2 search are equivalent\n");
}
if (max_norm2 > min_norm2 * 100) {
do_comment(
"vectors have very large differences in norms, "
"is this normal?\n");
}
}
{ // per dimension stats
double max_std = 0, min_std = HUGE_VAL;
size_t n_dangerous_range = 0, n_0_range = 0, n0 = 0;
for (size_t j = 0; j < d; j++) {
PerDimStats& st = per_dim_stats[j];
st.compute_mean_std();
n0 += st.n0;
if (st.max == st.min) {
n_0_range++;
} else if (st.max < 1.001 * st.min) {
n_dangerous_range++;
}
if (st.stddev > max_std)
max_std = st.stddev;
if (st.stddev < min_std)
min_std = st.stddev;
}
if (n0 == 0) {
do_comment("matrix contains no 0s\n");
} else {
do_comment(
"matrix contains %.2f %% 0 entries\n",
n0 * 100.0 / (n * d));
}
if (n_0_range == 0) {
do_comment("no constant dimensions\n");
} else {
do_comment(
"%ld dimensions are constant: they can be removed\n",
n_0_range);
}
if (n_dangerous_range == 0) {
do_comment("no dimension has a too large mean\n");
} else {
do_comment(
"%ld dimensions are too large "
"wrt. their variance, may loose precision "
"in IndexFlatL2 (use CenteringTransform)\n",
n_dangerous_range);
}
do_comment("stddevs per dimension are in [%g %g]\n", min_std, max_std);
size_t n_small_var = 0;
for (size_t j = 0; j < d; j++) {
const PerDimStats& st = per_dim_stats[j];
if (st.stddev < max_std * 1e-4) {
n_small_var++;
}
}
if (n_small_var > 0) {
do_comment(
"%ld dimensions have negligible stddev wrt. "
"the largest dimension, they could be ignored",
n_small_var);
}
}
comments = comment_buf.data();
buf = nullptr;
nbuf = 0;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#pragma once
#include <stdint.h>
#include <string>
#include <unordered_map>
#include <vector>
namespace faiss {
/** Reports some statistics on a dataset and comments on them.
*
* It is a class rather than a function so that all stats can also be
* accessed from code */
struct MatrixStats {
MatrixStats(size_t n, size_t d, const float* x);
std::string comments;
// raw statistics
size_t n, d;
size_t n_collision, n_valid, n0;
double min_norm2, max_norm2;
struct PerDimStats {
size_t n, n_nan, n_inf, n0;
float min, max;
double sum, sum2;
size_t n_valid;
double mean, stddev;
PerDimStats();
void add(float x);
void compute_mean_std();
};
std::vector<PerDimStats> per_dim_stats;
struct Occurrence {
size_t first;
size_t count;
};
std::unordered_map<uint64_t, Occurrence> occurrences;
char* buf;
size_t nbuf;
void do_comment(const char* fmt, ...);
};
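/* Usage sketch: analyze a dataset and print the generated comments. The
 * array name xb and the sizes nb, d are illustrative assumptions.
 *
 *   faiss::MatrixStats stats(nb, d, xb);
 *   printf("%s", stats.comments.c_str());
 *   // the raw statistics remain accessible, e.g. stats.min_norm2
 */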
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/MetaIndexes.h>
#include <stdint.h>
#include <cinttypes>
#include <cstdio>
#include <limits>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/WorkerThread.h>
namespace faiss {
namespace {} // namespace
/*****************************************************
* IndexIDMap implementation
*******************************************************/
template <typename IndexT>
IndexIDMapTemplate<IndexT>::IndexIDMapTemplate(IndexT* index)
: index(index), own_fields(false) {
FAISS_THROW_IF_NOT_MSG(index->ntotal == 0, "index must be empty on input");
this->is_trained = index->is_trained;
this->metric_type = index->metric_type;
this->verbose = index->verbose;
this->d = index->d;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::add(
idx_t,
const typename IndexT::component_t*) {
FAISS_THROW_MSG(
"add does not make sense with IndexIDMap, "
"use add_with_ids");
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::train(
idx_t n,
const typename IndexT::component_t* x) {
index->train(n, x);
this->is_trained = index->is_trained;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::reset() {
index->reset();
id_map.clear();
this->ntotal = 0;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::add_with_ids(
idx_t n,
const typename IndexT::component_t* x,
const typename IndexT::idx_t* xids) {
index->add(n, x);
for (idx_t i = 0; i < n; i++)
id_map.push_back(xids[i]);
this->ntotal = index->ntotal;
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::search(
idx_t n,
const typename IndexT::component_t* x,
idx_t k,
typename IndexT::distance_t* distances,
typename IndexT::idx_t* labels) const {
index->search(n, x, k, distances, labels);
idx_t* li = labels;
#pragma omp parallel for
for (idx_t i = 0; i < n * k; i++) {
li[i] = li[i] < 0 ? li[i] : id_map[li[i]];
}
}
template <typename IndexT>
void IndexIDMapTemplate<IndexT>::range_search(
typename IndexT::idx_t n,
const typename IndexT::component_t* x,
typename IndexT::distance_t radius,
RangeSearchResult* result) const {
index->range_search(n, x, radius, result);
#pragma omp parallel for
for (idx_t i = 0; i < result->lims[result->nq]; i++) {
result->labels[i] = result->labels[i] < 0 ? result->labels[i]
: id_map[result->labels[i]];
}
}
namespace {
struct IDTranslatedSelector : IDSelector {
const std::vector<int64_t>& id_map;
const IDSelector& sel;
IDTranslatedSelector(
const std::vector<int64_t>& id_map,
const IDSelector& sel)
: id_map(id_map), sel(sel) {}
bool is_member(idx_t id) const override {
return sel.is_member(id_map[id]);
}
};
} // namespace
template <typename IndexT>
size_t IndexIDMapTemplate<IndexT>::remove_ids(const IDSelector& sel) {
// remove in sub-index first
IDTranslatedSelector sel2(id_map, sel);
size_t nremove = index->remove_ids(sel2);
int64_t j = 0;
for (idx_t i = 0; i < this->ntotal; i++) {
if (sel.is_member(id_map[i])) {
// remove
} else {
id_map[j] = id_map[i];
j++;
}
}
FAISS_ASSERT(j == index->ntotal);
this->ntotal = j;
id_map.resize(this->ntotal);
return nremove;
}
template <typename IndexT>
IndexIDMapTemplate<IndexT>::~IndexIDMapTemplate() {
if (own_fields)
delete index;
}
/*****************************************************
* IndexIDMap2 implementation
*******************************************************/
template <typename IndexT>
IndexIDMap2Template<IndexT>::IndexIDMap2Template(IndexT* index)
: IndexIDMapTemplate<IndexT>(index) {}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::add_with_ids(
idx_t n,
const typename IndexT::component_t* x,
const typename IndexT::idx_t* xids) {
size_t prev_ntotal = this->ntotal;
IndexIDMapTemplate<IndexT>::add_with_ids(n, x, xids);
for (size_t i = prev_ntotal; i < this->ntotal; i++) {
rev_map[this->id_map[i]] = i;
}
}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::construct_rev_map() {
rev_map.clear();
for (size_t i = 0; i < this->ntotal; i++) {
rev_map[this->id_map[i]] = i;
}
}
template <typename IndexT>
size_t IndexIDMap2Template<IndexT>::remove_ids(const IDSelector& sel) {
// This is quite inefficient
size_t nremove = IndexIDMapTemplate<IndexT>::remove_ids(sel);
construct_rev_map();
return nremove;
}
template <typename IndexT>
void IndexIDMap2Template<IndexT>::reconstruct(
idx_t key,
typename IndexT::component_t* recons) const {
try {
this->index->reconstruct(rev_map.at(key), recons);
} catch (const std::out_of_range& e) {
FAISS_THROW_FMT("key %" PRId64 " not found", key);
}
}
// explicit template instantiations
template struct IndexIDMapTemplate<Index>;
template struct IndexIDMapTemplate<IndexBinary>;
template struct IndexIDMap2Template<Index>;
template struct IndexIDMap2Template<IndexBinary>;
/*****************************************************
* IndexSplitVectors implementation
*******************************************************/
IndexSplitVectors::IndexSplitVectors(idx_t d, bool threaded)
: Index(d), own_fields(false), threaded(threaded), sum_d(0) {}
void IndexSplitVectors::add_sub_index(Index* index) {
sub_indexes.push_back(index);
sync_with_sub_indexes();
}
void IndexSplitVectors::sync_with_sub_indexes() {
if (sub_indexes.empty())
return;
Index* index0 = sub_indexes[0];
sum_d = index0->d;
metric_type = index0->metric_type;
is_trained = index0->is_trained;
ntotal = index0->ntotal;
for (int i = 1; i < sub_indexes.size(); i++) {
Index* index = sub_indexes[i];
FAISS_THROW_IF_NOT(metric_type == index->metric_type);
FAISS_THROW_IF_NOT(ntotal == index->ntotal);
sum_d += index->d;
}
}
void IndexSplitVectors::add(idx_t /*n*/, const float* /*x*/) {
FAISS_THROW_MSG("not implemented");
}
void IndexSplitVectors::search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const {
FAISS_THROW_IF_NOT_MSG(k == 1, "search implemented only for k=1");
FAISS_THROW_IF_NOT_MSG(
sum_d == d, "not enough indexes compared to # dimensions");
int64_t nshard = sub_indexes.size();
float* all_distances = new float[nshard * k * n];
idx_t* all_labels = new idx_t[nshard * k * n];
ScopeDeleter<float> del(all_distances);
ScopeDeleter<idx_t> del2(all_labels);
auto query_func = [n,
x,
k,
distances,
labels,
all_distances,
all_labels,
this](int no) {
const IndexSplitVectors* index = this;
float* distances1 = no == 0 ? distances : all_distances + no * k * n;
idx_t* labels1 = no == 0 ? labels : all_labels + no * k * n;
if (index->verbose)
printf("begin query shard %d on %" PRId64 " points\n", no, n);
const Index* sub_index = index->sub_indexes[no];
int64_t sub_d = sub_index->d, d = index->d;
idx_t ofs = 0;
for (int i = 0; i < no; i++)
ofs += index->sub_indexes[i]->d;
float* sub_x = new float[sub_d * n];
ScopeDeleter<float> del1(sub_x);
for (idx_t i = 0; i < n; i++)
memcpy(sub_x + i * sub_d, x + ofs + i * d, sub_d * sizeof(sub_x[0]));
sub_index->search(n, sub_x, k, distances1, labels1);
if (index->verbose)
printf("end query shard %d\n", no);
};
if (!threaded) {
for (int i = 0; i < nshard; i++) {
query_func(i);
}
} else {
std::vector<std::unique_ptr<WorkerThread>> threads;
std::vector<std::future<bool>> v;
for (int i = 0; i < nshard; i++) {
threads.emplace_back(new WorkerThread());
WorkerThread* wt = threads.back().get();
v.emplace_back(wt->add([i, query_func]() { query_func(i); }));
}
// Blocking wait for completion
for (auto& func : v) {
func.get();
}
}
int64_t factor = 1;
for (int i = 0; i < nshard; i++) {
if (i > 0) { // results of 0 are already in the table
const float* distances_i = all_distances + i * k * n;
const idx_t* labels_i = all_labels + i * k * n;
for (int64_t j = 0; j < n; j++) {
if (labels[j] >= 0 && labels_i[j] >= 0) {
labels[j] += labels_i[j] * factor;
distances[j] += distances_i[j];
} else {
labels[j] = -1;
distances[j] = std::numeric_limits<float>::quiet_NaN();
}
}
}
factor *= sub_indexes[i]->ntotal;
}
}
void IndexSplitVectors::train(idx_t /*n*/, const float* /*x*/) {
FAISS_THROW_MSG("not implemented");
}
void IndexSplitVectors::reset() {
FAISS_THROW_MSG("not implemented");
}
IndexSplitVectors::~IndexSplitVectors() {
if (own_fields) {
for (int s = 0; s < sub_indexes.size(); s++)
delete sub_indexes[s];
}
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef META_INDEXES_H
#define META_INDEXES_H
#include <faiss/Index.h>
#include <faiss/IndexReplicas.h>
#include <faiss/IndexShards.h>
#include <unordered_map>
#include <vector>
namespace faiss {
/** Index that translates search results to ids */
template <typename IndexT>
struct IndexIDMapTemplate : IndexT {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
IndexT* index; ///! the sub-index
bool own_fields; ///! whether pointers are deleted in the destructor
std::vector<idx_t> id_map;
explicit IndexIDMapTemplate(IndexT* index);
/// @param xids if non-null, ids to store for the vectors (size n)
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
/// this will fail. Use add_with_ids
void add(idx_t n, const component_t* x) override;
void search(
idx_t n,
const component_t* x,
idx_t k,
distance_t* distances,
idx_t* labels) const override;
void train(idx_t n, const component_t* x) override;
void reset() override;
/// remove ids; assumes the sub-index renumbers sequentially on removal
/// (as IndexFlat does)
size_t remove_ids(const IDSelector& sel) override;
void range_search(
idx_t n,
const component_t* x,
distance_t radius,
RangeSearchResult* result) const override;
~IndexIDMapTemplate() override;
IndexIDMapTemplate() {
own_fields = false;
index = nullptr;
}
};
using IndexIDMap = IndexIDMapTemplate<Index>;
using IndexBinaryIDMap = IndexIDMapTemplate<IndexBinary>;
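/* Usage sketch: attach user-provided 64-bit ids to an index that does not
 * support add_with_ids natively. The wrapped index type and the arrays
 * (xb, ids, xq, D, I) are illustrative assumptions.
 *
 *   faiss::IndexFlatL2 flat(d);
 *   faiss::IndexIDMap index(&flat);
 *   index.add_with_ids(nb, xb, ids); // ids: int64 array of size nb
 *   index.search(nq, xq, 10, D, I);  // I contains the user-provided ids
 */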
/** same as IndexIDMap but also provides an efficient reconstruction
* implementation via a 2-way index */
template <typename IndexT>
struct IndexIDMap2Template : IndexIDMapTemplate<IndexT> {
using idx_t = typename IndexT::idx_t;
using component_t = typename IndexT::component_t;
using distance_t = typename IndexT::distance_t;
std::unordered_map<idx_t, idx_t> rev_map;
explicit IndexIDMap2Template(IndexT* index);
/// make the rev_map from scratch
void construct_rev_map();
void add_with_ids(idx_t n, const component_t* x, const idx_t* xids)
override;
size_t remove_ids(const IDSelector& sel) override;
void reconstruct(idx_t key, component_t* recons) const override;
~IndexIDMap2Template() override {}
IndexIDMap2Template() {}
};
using IndexIDMap2 = IndexIDMap2Template<Index>;
using IndexBinaryIDMap2 = IndexIDMap2Template<IndexBinary>;
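/* Usage sketch: same as IndexIDMap, plus reconstruction keyed by the
 * user-provided id. All names below are illustrative assumptions.
 *
 *   faiss::IndexFlatL2 flat(d);
 *   faiss::IndexIDMap2 index2(&flat);
 *   index2.add_with_ids(nb, xb, ids);
 *   std::vector<float> recons(d);
 *   index2.reconstruct(ids[0], recons.data()); // throws on unknown ids
 */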
/** splits input vectors in segments and assigns each segment to a sub-index
* used to distribute a MultiIndexQuantizer
*/
struct IndexSplitVectors : Index {
bool own_fields;
bool threaded;
std::vector<Index*> sub_indexes;
idx_t sum_d; ///< sum of dimensions seen so far
explicit IndexSplitVectors(idx_t d, bool threaded = false);
void add_sub_index(Index*);
void sync_with_sub_indexes();
void add(idx_t n, const float* x) override;
void search(
idx_t n,
const float* x,
idx_t k,
float* distances,
idx_t* labels) const override;
void train(idx_t n, const float* x) override;
void reset() override;
~IndexSplitVectors() override;
};
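/* Usage sketch: each sub-index handles a contiguous slice of the input
 * dimensions; the sub-index dimensions must sum to d and only k == 1 is
 * supported at search time. sub0, sub1 and the arrays are illustrative
 * assumptions (the sub-indexes must already contain the same number of
 * vectors).
 *
 *   faiss::IndexSplitVectors split(d, false);
 *   split.add_sub_index(&sub0); // covers dimensions [0, sub0.d)
 *   split.add_sub_index(&sub1); // covers dimensions [sub0.d, d)
 *   split.search(nq, xq, 1, D, I);
 */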
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_METRIC_TYPE_H
#define FAISS_METRIC_TYPE_H
namespace faiss {
/// The metric space for vector comparison for Faiss indices and algorithms.
///
/// Most algorithms support both inner product and L2, with the flat
/// (brute-force) indices supporting additional metric types for vector
/// comparison.
enum MetricType {
METRIC_INNER_PRODUCT = 0, ///< maximum inner product search
METRIC_L2 = 1, ///< squared L2 search
METRIC_L1, ///< L1 (aka cityblock)
METRIC_Linf, ///< infinity distance
METRIC_Lp, ///< L_p distance, p is given by a faiss::Index
/// metric_arg
/// some additional metrics defined in scipy.spatial.distance
METRIC_Canberra = 20,
METRIC_BrayCurtis,
METRIC_JensenShannon,
};
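/* Usage sketch: the metric is chosen when constructing an index; flat
 * (brute-force) indexes also accept the additional metrics. d is an
 * illustrative assumption.
 *
 *   faiss::IndexFlat ip_index(d, faiss::METRIC_INNER_PRODUCT);
 *   faiss::IndexFlat l1_index(d, faiss::METRIC_L1);
 */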
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/VectorTransform.h>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <memory>
#include <faiss/IndexPQ.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/distances.h>
#include <faiss/utils/random.h>
#include <faiss/utils/utils.h>
using namespace faiss;
extern "C" {
// this is to keep the clang syntax checker happy
#ifndef FINTEGER
#define FINTEGER int
#endif
/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
int sgemm_(
const char* transa,
const char* transb,
FINTEGER* m,
FINTEGER* n,
FINTEGER* k,
const float* alpha,
const float* a,
FINTEGER* lda,
const float* b,
FINTEGER* ldb,
float* beta,
float* c,
FINTEGER* ldc);
int dgemm_(
const char* transa,
const char* transb,
FINTEGER* m,
FINTEGER* n,
FINTEGER* k,
const double* alpha,
const double* a,
FINTEGER* lda,
const double* b,
FINTEGER* ldb,
double* beta,
double* c,
FINTEGER* ldc);
int ssyrk_(
const char* uplo,
const char* trans,
FINTEGER* n,
FINTEGER* k,
float* alpha,
float* a,
FINTEGER* lda,
float* beta,
float* c,
FINTEGER* ldc);
/* Lapack functions from http://www.netlib.org/clapack/old/single/ */
int ssyev_(
const char* jobz,
const char* uplo,
FINTEGER* n,
float* a,
FINTEGER* lda,
float* w,
float* work,
FINTEGER* lwork,
FINTEGER* info);
int dsyev_(
const char* jobz,
const char* uplo,
FINTEGER* n,
double* a,
FINTEGER* lda,
double* w,
double* work,
FINTEGER* lwork,
FINTEGER* info);
int sgesvd_(
const char* jobu,
const char* jobvt,
FINTEGER* m,
FINTEGER* n,
float* a,
FINTEGER* lda,
float* s,
float* u,
FINTEGER* ldu,
float* vt,
FINTEGER* ldvt,
float* work,
FINTEGER* lwork,
FINTEGER* info);
int dgesvd_(
const char* jobu,
const char* jobvt,
FINTEGER* m,
FINTEGER* n,
double* a,
FINTEGER* lda,
double* s,
double* u,
FINTEGER* ldu,
double* vt,
FINTEGER* ldvt,
double* work,
FINTEGER* lwork,
FINTEGER* info);
}
/*********************************************
* VectorTransform
*********************************************/
float* VectorTransform::apply(Index::idx_t n, const float* x) const {
float* xt = new float[n * d_out];
apply_noalloc(n, x, xt);
return xt;
}
void VectorTransform::train(idx_t, const float*) {
// does nothing by default
}
void VectorTransform::reverse_transform(idx_t, const float*, float*) const {
FAISS_THROW_MSG("reverse transform not implemented");
}
/*********************************************
* LinearTransform
*********************************************/
/// both d_in > d_out and d_out < d_in are supported
LinearTransform::LinearTransform(int d_in, int d_out, bool have_bias)
: VectorTransform(d_in, d_out),
have_bias(have_bias),
is_orthonormal(false),
verbose(false) {
is_trained = false; // will be trained when A and b are initialized
}
void LinearTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt)
const {
FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet");
float c_factor;
if (have_bias) {
FAISS_THROW_IF_NOT_MSG(b.size() == d_out, "Bias not initialized");
float* xi = xt;
for (int i = 0; i < n; i++)
for (int j = 0; j < d_out; j++)
*xi++ = b[j];
c_factor = 1.0;
} else {
c_factor = 0.0;
}
FAISS_THROW_IF_NOT_MSG(
A.size() == d_out * d_in, "Transformation matrix not initialized");
float one = 1;
FINTEGER nbiti = d_out, ni = n, di = d_in;
sgemm_("Transposed",
"Not transposed",
&nbiti,
&ni,
&di,
&one,
A.data(),
&di,
x,
&di,
&c_factor,
xt,
&nbiti);
}
void LinearTransform::transform_transpose(idx_t n, const float* y, float* x)
const {
if (have_bias) { // allocate buffer to store bias-corrected data
float* y_new = new float[n * d_out];
const float* yr = y;
float* yw = y_new;
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < d_out; j++) {
*yw++ = *yr++ - b[j];
}
}
y = y_new;
}
{
FINTEGER dii = d_in, doi = d_out, ni = n;
float one = 1.0, zero = 0.0;
sgemm_("Not",
"Not",
&dii,
&ni,
&doi,
&one,
A.data(),
&dii,
y,
&doi,
&zero,
x,
&dii);
}
if (have_bias)
delete[] y;
}
void LinearTransform::set_is_orthonormal() {
if (d_out > d_in) {
// not clear what we should do in this case
is_orthonormal = false;
return;
}
if (d_out == 0) { // borderline case, unnormalized matrix
is_orthonormal = true;
return;
}
double eps = 4e-5;
FAISS_ASSERT(A.size() >= d_out * d_in);
{
std::vector<float> ATA(d_out * d_out);
FINTEGER dii = d_in, doi = d_out;
float one = 1.0, zero = 0.0;
sgemm_("Transposed",
"Not",
&doi,
&doi,
&dii,
&one,
A.data(),
&dii,
A.data(),
&dii,
&zero,
ATA.data(),
&doi);
is_orthonormal = true;
for (long i = 0; i < d_out; i++) {
for (long j = 0; j < d_out; j++) {
float v = ATA[i + j * d_out];
if (i == j)
v -= 1;
if (fabs(v) > eps) {
is_orthonormal = false;
}
}
}
}
}
void LinearTransform::reverse_transform(idx_t n, const float* xt, float* x)
const {
if (is_orthonormal) {
transform_transpose(n, xt, x);
} else {
FAISS_THROW_MSG(
"reverse transform not implemented for non-orthonormal matrices");
}
}
void LinearTransform::print_if_verbose(
const char* name,
const std::vector<double>& mat,
int n,
int d) const {
if (!verbose)
return;
printf("matrix %s: %d*%d [\n", name, n, d);
FAISS_THROW_IF_NOT(mat.size() >= n * d);
for (int i = 0; i < n; i++) {
for (int j = 0; j < d; j++) {
printf("%10.5g ", mat[i * d + j]);
}
printf("\n");
}
printf("]\n");
}
/*********************************************
* RandomRotationMatrix
*********************************************/
void RandomRotationMatrix::init(int seed) {
if (d_out <= d_in) {
A.resize(d_out * d_in);
float* q = A.data();
float_randn(q, d_out * d_in, seed);
matrix_qr(d_in, d_out, q);
} else {
// use tight-frame transformation
A.resize(d_out * d_out);
float* q = A.data();
float_randn(q, d_out * d_out, seed);
matrix_qr(d_out, d_out, q);
// remove columns
int i, j;
for (i = 0; i < d_out; i++) {
for (j = 0; j < d_in; j++) {
q[i * d_in + j] = q[i * d_out + j];
}
}
A.resize(d_in * d_out);
}
is_orthonormal = true;
is_trained = true;
}
void RandomRotationMatrix::train(Index::idx_t /*n*/, const float* /*x*/) {
// initialize with some arbitrary seed
init(12345);
}
/*********************************************
* PCAMatrix
*********************************************/
PCAMatrix::PCAMatrix(
int d_in,
int d_out,
float eigen_power,
bool random_rotation)
: LinearTransform(d_in, d_out, true),
eigen_power(eigen_power),
random_rotation(random_rotation) {
is_trained = false;
max_points_per_d = 1000;
balanced_bins = 0;
epsilon = 0;
}
namespace {
/// Compute the eigenvalue decomposition of symmetric matrix cov,
/// dimensions d_in-by-d_in. Output eigenvectors in cov.
void eig(size_t d_in, double* cov, double* eigenvalues, int verbose) {
{ // compute eigenvalues and vectors
FINTEGER info = 0, lwork = -1, di = d_in;
double workq;
dsyev_("Vectors as well",
"Upper",
&di,
cov,
&di,
eigenvalues,
&workq,
&lwork,
&info);
lwork = FINTEGER(workq);
double* work = new double[lwork];
dsyev_("Vectors as well",
"Upper",
&di,
cov,
&di,
eigenvalues,
work,
&lwork,
&info);
delete[] work;
if (info != 0) {
fprintf(stderr,
"WARN ssyev info returns %d, "
"a very bad PCA matrix is learnt\n",
int(info));
// do not throw exception, as the matrix could still be useful
}
if (verbose && d_in <= 10) {
printf("info=%ld new eigvals=[", long(info));
for (int j = 0; j < d_in; j++)
printf("%g ", eigenvalues[j]);
printf("]\n");
double* ci = cov;
printf("eigenvecs=\n");
for (int i = 0; i < d_in; i++) {
for (int j = 0; j < d_in; j++)
printf("%10.4g ", *ci++);
printf("\n");
}
}
}
// reverse the order of eigenvectors & values
for (int i = 0; i < d_in / 2; i++) {
std::swap(eigenvalues[i], eigenvalues[d_in - 1 - i]);
double* v1 = cov + i * d_in;
double* v2 = cov + (d_in - 1 - i) * d_in;
for (int j = 0; j < d_in; j++)
std::swap(v1[j], v2[j]);
}
}
} // namespace
void PCAMatrix::train(Index::idx_t n, const float* x) {
const float* x_in = x;
x = fvecs_maybe_subsample(
d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose);
ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
// compute mean
mean.clear();
mean.resize(d_in, 0.0);
if (have_bias) { // we may want to skip the bias
const float* xi = x;
for (int i = 0; i < n; i++) {
for (int j = 0; j < d_in; j++)
mean[j] += *xi++;
}
for (int j = 0; j < d_in; j++)
mean[j] /= n;
}
if (verbose) {
printf("mean=[");
for (int j = 0; j < d_in; j++)
printf("%g ", mean[j]);
printf("]\n");
}
if (n >= d_in) {
// compute covariance matrix, store it in PCA matrix
PCAMat.resize(d_in * d_in);
float* cov = PCAMat.data();
{ // initialize with mean * mean^T term
float* ci = cov;
for (int i = 0; i < d_in; i++) {
for (int j = 0; j < d_in; j++)
*ci++ = -n * mean[i] * mean[j];
}
}
{
FINTEGER di = d_in, ni = n;
float one = 1.0;
ssyrk_("Up",
"Non transposed",
&di,
&ni,
&one,
(float*)x,
&di,
&one,
cov,
&di);
}
if (verbose && d_in <= 10) {
float* ci = cov;
printf("cov=\n");
for (int i = 0; i < d_in; i++) {
for (int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> covd(d_in * d_in);
for (size_t i = 0; i < d_in * d_in; i++)
covd[i] = cov[i];
std::vector<double> eigenvaluesd(d_in);
eig(d_in, covd.data(), eigenvaluesd.data(), verbose);
for (size_t i = 0; i < d_in * d_in; i++)
PCAMat[i] = covd[i];
eigenvalues.resize(d_in);
for (size_t i = 0; i < d_in; i++)
eigenvalues[i] = eigenvaluesd[i];
} else {
std::vector<float> xc(n * d_in);
for (size_t i = 0; i < n; i++)
for (size_t j = 0; j < d_in; j++)
xc[i * d_in + j] = x[i * d_in + j] - mean[j];
// compute Gram matrix
std::vector<float> gram(n * n);
{
FINTEGER di = d_in, ni = n;
float one = 1.0, zero = 0.0;
ssyrk_("Up",
"Transposed",
&ni,
&di,
&one,
xc.data(),
&di,
&zero,
gram.data(),
&ni);
}
if (verbose && d_in <= 10) {
float* ci = gram.data();
printf("gram=\n");
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
std::vector<double> gramd(n * n);
for (size_t i = 0; i < n * n; i++)
gramd[i] = gram[i];
std::vector<double> eigenvaluesd(n);
// eig will fill in only the n first eigenvals
eig(n, gramd.data(), eigenvaluesd.data(), verbose);
PCAMat.resize(d_in * n);
for (size_t i = 0; i < n * n; i++)
gram[i] = gramd[i];
eigenvalues.resize(d_in);
// fill in only the n first ones
for (size_t i = 0; i < n; i++)
eigenvalues[i] = eigenvaluesd[i];
{ // compute PCAMat = x' * v
FINTEGER di = d_in, ni = n;
float one = 1.0;
sgemm_("Non",
"Non Trans",
&di,
&ni,
&ni,
&one,
xc.data(),
&di,
gram.data(),
&ni,
&one,
PCAMat.data(),
&di);
}
if (verbose && d_in <= 10) {
float* ci = PCAMat.data();
printf("PCAMat=\n");
for (int i = 0; i < n; i++) {
for (int j = 0; j < d_in; j++)
printf("%10g ", *ci++);
printf("\n");
}
}
fvec_renorm_L2(d_in, n, PCAMat.data());
}
prepare_Ab();
is_trained = true;
}
void PCAMatrix::copy_from(const PCAMatrix& other) {
FAISS_THROW_IF_NOT(other.is_trained);
mean = other.mean;
eigenvalues = other.eigenvalues;
PCAMat = other.PCAMat;
prepare_Ab();
is_trained = true;
}
void PCAMatrix::prepare_Ab() {
FAISS_THROW_IF_NOT_FMT(
d_out * d_in <= PCAMat.size(),
"PCA matrix cannot output %d dimensions from %d ",
d_out,
d_in);
if (!random_rotation) {
A = PCAMat;
A.resize(d_out * d_in); // strip off useless dimensions
// first scale the components
if (eigen_power != 0) {
float* ai = A.data();
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i] + epsilon, eigen_power);
for (int j = 0; j < d_in; j++)
*ai++ *= factor;
}
}
if (balanced_bins != 0) {
FAISS_THROW_IF_NOT(d_out % balanced_bins == 0);
int dsub = d_out / balanced_bins;
std::vector<float> Ain;
std::swap(A, Ain);
A.resize(d_out * d_in);
std::vector<float> accu(balanced_bins);
std::vector<int> counter(balanced_bins);
// greedy assignment
for (int i = 0; i < d_out; i++) {
// find best bin
int best_j = -1;
float min_w = 1e30;
for (int j = 0; j < balanced_bins; j++) {
if (counter[j] < dsub && accu[j] < min_w) {
min_w = accu[j];
best_j = j;
}
}
int row_dst = best_j * dsub + counter[best_j];
accu[best_j] += eigenvalues[i];
counter[best_j]++;
memcpy(&A[row_dst * d_in], &Ain[i * d_in], d_in * sizeof(A[0]));
}
if (verbose) {
printf(" bin accu=[");
for (int i = 0; i < balanced_bins; i++)
printf("%g ", accu[i]);
printf("]\n");
}
}
} else {
FAISS_THROW_IF_NOT_MSG(
balanced_bins == 0,
"both balancing bins and applying a random rotation "
"does not make sense");
RandomRotationMatrix rr(d_out, d_out);
rr.init(5);
// apply scaling on the rotation matrix (right multiplication)
if (eigen_power != 0) {
for (int i = 0; i < d_out; i++) {
float factor = pow(eigenvalues[i], eigen_power);
for (int j = 0; j < d_out; j++)
rr.A[j * d_out + i] *= factor;
}
}
A.resize(d_in * d_out);
{
FINTEGER dii = d_in, doo = d_out;
float one = 1.0, zero = 0.0;
sgemm_("Not",
"Not",
&dii,
&doo,
&doo,
&one,
PCAMat.data(),
&dii,
rr.A.data(),
&doo,
&zero,
A.data(),
&dii);
}
}
b.clear();
b.resize(d_out);
for (int i = 0; i < d_out; i++) {
float accu = 0;
for (int j = 0; j < d_in; j++)
accu -= mean[j] * A[j + i * d_in];
b[i] = accu;
}
is_orthonormal = eigen_power == 0;
}
/*********************************************
* ITQMatrix
*********************************************/
ITQMatrix::ITQMatrix(int d)
: LinearTransform(d, d, false), max_iter(50), seed(123) {}
/** translated from fbcode/deeplearning/catalyzer/catalyzer/quantizers.py */
void ITQMatrix::train(Index::idx_t n, const float* xf) {
size_t d = d_in;
std::vector<double> rotation(d * d);
if (init_rotation.size() == d * d) {
memcpy(rotation.data(),
init_rotation.data(),
d * d * sizeof(rotation[0]));
} else {
RandomRotationMatrix rrot(d, d);
rrot.init(seed);
for (size_t i = 0; i < d * d; i++) {
rotation[i] = rrot.A[i];
}
}
std::vector<double> x(n * d);
for (size_t i = 0; i < n * d; i++) {
x[i] = xf[i];
}
std::vector<double> rotated_x(n * d), cov_mat(d * d);
std::vector<double> u(d * d), vt(d * d), singvals(d);
for (int i = 0; i < max_iter; i++) {
print_if_verbose("rotation", rotation, d, d);
{ // rotated_data = np.dot(training_data, rotation)
FINTEGER di = d, ni = n;
double one = 1, zero = 0;
dgemm_("N",
"N",
&di,
&ni,
&di,
&one,
rotation.data(),
&di,
x.data(),
&di,
&zero,
rotated_x.data(),
&di);
}
print_if_verbose("rotated_x", rotated_x, n, d);
// binarize
for (size_t j = 0; j < n * d; j++) {
rotated_x[j] = rotated_x[j] < 0 ? -1 : 1;
}
// covariance matrix
{ // rotated_data = np.dot(training_data, rotation)
FINTEGER di = d, ni = n;
double one = 1, zero = 0;
dgemm_("N",
"T",
&di,
&di,
&ni,
&one,
rotated_x.data(),
&di,
x.data(),
&di,
&zero,
cov_mat.data(),
&di);
}
print_if_verbose("cov_mat", cov_mat, d, d);
// SVD
{
FINTEGER di = d;
FINTEGER lwork = -1, info;
double lwork1;
// workspace query
dgesvd_("A",
"A",
&di,
&di,
cov_mat.data(),
&di,
singvals.data(),
u.data(),
&di,
vt.data(),
&di,
&lwork1,
&lwork,
&info);
FAISS_THROW_IF_NOT(info == 0);
lwork = size_t(lwork1);
std::vector<double> work(lwork);
dgesvd_("A",
"A",
&di,
&di,
cov_mat.data(),
&di,
singvals.data(),
u.data(),
&di,
vt.data(),
&di,
work.data(),
&lwork,
&info);
FAISS_THROW_IF_NOT_FMT(info == 0, "sgesvd returned info=%d", info);
}
print_if_verbose("u", u, d, d);
print_if_verbose("vt", vt, d, d);
// update rotation
{
FINTEGER di = d;
double one = 1, zero = 0;
dgemm_("N",
"T",
&di,
&di,
&di,
&one,
u.data(),
&di,
vt.data(),
&di,
&zero,
rotation.data(),
&di);
}
print_if_verbose("final rot", rotation, d, d);
}
A.resize(d * d);
for (size_t i = 0; i < d; i++) {
for (size_t j = 0; j < d; j++) {
A[i + d * j] = rotation[j + d * i];
}
}
is_trained = true;
}
ITQTransform::ITQTransform(int d_in, int d_out, bool do_pca)
: VectorTransform(d_in, d_out),
do_pca(do_pca),
itq(d_out),
pca_then_itq(d_in, d_out, false) {
if (!do_pca) {
FAISS_THROW_IF_NOT(d_in == d_out);
}
max_train_per_dim = 10;
is_trained = false;
}
void ITQTransform::train(idx_t n, const float* x) {
FAISS_THROW_IF_NOT(!is_trained);
const float* x_in = x;
size_t max_train_points = std::max(d_in * max_train_per_dim, 32768);
x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x);
ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
std::unique_ptr<float[]> x_norm(new float[n * d_in]);
{ // normalize
int d = d_in;
mean.resize(d, 0);
for (idx_t i = 0; i < n; i++) {
for (idx_t j = 0; j < d; j++) {
mean[j] += x[i * d + j];
}
}
for (idx_t j = 0; j < d; j++) {
mean[j] /= n;
}
for (idx_t i = 0; i < n; i++) {
for (idx_t j = 0; j < d; j++) {
x_norm[i * d + j] = x[i * d + j] - mean[j];
}
}
fvec_renorm_L2(d_in, n, x_norm.get());
}
// train PCA
PCAMatrix pca(d_in, d_out);
float* x_pca;
std::unique_ptr<float[]> x_pca_del;
if (do_pca) {
pca.have_bias = false; // for consistency with reference implem
pca.train(n, x_norm.get());
x_pca = pca.apply(n, x_norm.get());
x_pca_del.reset(x_pca);
} else {
x_pca = x_norm.get();
}
// train ITQ
itq.train(n, x_pca);
// merge PCA and ITQ
if (do_pca) {
FINTEGER di = d_out, dini = d_in;
float one = 1, zero = 0;
pca_then_itq.A.resize(d_in * d_out);
sgemm_("N",
"N",
&dini,
&di,
&di,
&one,
pca.A.data(),
&dini,
itq.A.data(),
&di,
&zero,
pca_then_itq.A.data(),
&dini);
} else {
pca_then_itq.A = itq.A;
}
pca_then_itq.is_trained = true;
is_trained = true;
}
void ITQTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt)
const {
FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet");
std::unique_ptr<float[]> x_norm(new float[n * d_in]);
{ // normalize
int d = d_in;
for (idx_t i = 0; i < n; i++) {
for (idx_t j = 0; j < d; j++) {
x_norm[i * d + j] = x[i * d + j] - mean[j];
}
}
// this is not really useful if we are going to binarize right
// afterwards but OK
fvec_renorm_L2(d_in, n, x_norm.get());
}
pca_then_itq.apply_noalloc(n, x_norm.get(), xt);
}
/*********************************************
* OPQMatrix
*********************************************/
OPQMatrix::OPQMatrix(int d, int M, int d2)
: LinearTransform(d, d2 == -1 ? d : d2, false),
M(M),
niter(50),
niter_pq(4),
niter_pq_0(40),
verbose(false),
pq(nullptr) {
is_trained = false;
// OPQ is quite expensive to train, so set this right.
max_train_points = 256 * 256;
pq = nullptr;
}
void OPQMatrix::train(Index::idx_t n, const float* x) {
const float* x_in = x;
x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x, verbose);
ScopeDeleter<float> del_x(x != x_in ? x : nullptr);
// To support d_out > d_in, we pad input vectors with 0s to d_out
size_t d = d_out <= d_in ? d_in : d_out;
size_t d2 = d_out;
#if 0
// what this test shows: the only way of getting bit-exact
// reproducible results with sgeqrf and sgesvd seems to be forcing
// single-threading.
{ // test repro
std::vector<float> r (d * d);
float * rotation = r.data();
float_randn (rotation, d * d, 1234);
printf("CS0: %016lx\n",
ivec_checksum (128*128, (int*)rotation));
matrix_qr (d, d, rotation);
printf("CS1: %016lx\n",
ivec_checksum (128*128, (int*)rotation));
return;
}
#endif
if (verbose) {
printf("OPQMatrix::train: training an OPQ rotation matrix "
"for M=%d from %" PRId64 " vectors in %dD -> %dD\n",
M,
n,
d_in,
d_out);
}
std::vector<float> xtrain(n * d);
// center x
{
std::vector<float> sum(d);
const float* xi = x;
for (size_t i = 0; i < n; i++) {
for (int j = 0; j < d_in; j++)
sum[j] += *xi++;
}
for (int i = 0; i < d; i++)
sum[i] /= n;
float* yi = xtrain.data();
xi = x;
for (size_t i = 0; i < n; i++) {
for (int j = 0; j < d_in; j++)
*yi++ = *xi++ - sum[j];
yi += d - d_in;
}
}
float* rotation;
if (A.size() == 0) {
A.resize(d * d);
rotation = A.data();
if (verbose)
printf(" OPQMatrix::train: making random %zd*%zd rotation\n",
d,
d);
float_randn(rotation, d * d, 1234);
matrix_qr(d, d, rotation);
// we use only the d * d2 upper part of the matrix
A.resize(d * d2);
} else {
FAISS_THROW_IF_NOT(A.size() == d * d2);
rotation = A.data();
}
std::vector<float> xproj(d2 * n), pq_recons(d2 * n), xxr(d * n),
tmp(d * d * 4);
ProductQuantizer pq_default(d2, M, 8);
ProductQuantizer& pq_regular = pq ? *pq : pq_default;
std::vector<uint8_t> codes(pq_regular.code_size * n);
double t0 = getmillisecs();
for (int iter = 0; iter < niter; iter++) {
{ // torch.mm(xtrain, rotation:t())
FINTEGER di = d, d2i = d2, ni = n;
float zero = 0, one = 1;
sgemm_("Transposed",
"Not transposed",
&d2i,
&ni,
&di,
&one,
rotation,
&di,
xtrain.data(),
&di,
&zero,
xproj.data(),
&d2i);
}
pq_regular.cp.max_points_per_centroid = 1000;
pq_regular.cp.niter = iter == 0 ? niter_pq_0 : niter_pq;
pq_regular.verbose = verbose;
pq_regular.train(n, xproj.data());
if (verbose) {
printf(" encode / decode\n");
}
if (pq_regular.assign_index) {
pq_regular.compute_codes_with_assign_index(
xproj.data(), codes.data(), n);
} else {
pq_regular.compute_codes(xproj.data(), codes.data(), n);
}
pq_regular.decode(codes.data(), pq_recons.data(), n);
float pq_err = fvec_L2sqr(pq_recons.data(), xproj.data(), n * d2) / n;
if (verbose)
printf(" Iteration %d (%d PQ iterations):"
"%.3f s, obj=%g\n",
iter,
pq_regular.cp.niter,
(getmillisecs() - t0) / 1000.0,
pq_err);
{
float *u = tmp.data(), *vt = &tmp[d * d];
float* sing_val = &tmp[2 * d * d];
FINTEGER di = d, d2i = d2, ni = n;
float one = 1, zero = 0;
if (verbose) {
printf(" X * recons\n");
}
// torch.mm(xtrain:t(), pq_recons)
sgemm_("Not",
"Transposed",
&d2i,
&di,
&ni,
&one,
pq_recons.data(),
&d2i,
xtrain.data(),
&di,
&zero,
xxr.data(),
&d2i);
FINTEGER lwork = -1, info = -1;
float worksz;
// workspace query
sgesvd_("All",
"All",
&d2i,
&di,
xxr.data(),
&d2i,
sing_val,
vt,
&d2i,
u,
&di,
&worksz,
&lwork,
&info);
lwork = int(worksz);
std::vector<float> work(lwork);
// u and vt swapped
sgesvd_("All",
"All",
&d2i,
&di,
xxr.data(),
&d2i,
sing_val,
vt,
&d2i,
u,
&di,
work.data(),
&lwork,
&info);
sgemm_("Transposed",
"Transposed",
&di,
&d2i,
&d2i,
&one,
u,
&di,
vt,
&d2i,
&zero,
rotation,
&di);
}
pq_regular.train_type = ProductQuantizer::Train_hot_start;
}
// remove the zero padding: keep only the first d_in columns of A
if (d > d_in) {
for (long i = 0; i < d_out; i++)
memmove(&A[i * d_in], &A[i * d], sizeof(A[0]) * d_in);
A.resize(d_in * d_out);
}
is_trained = true;
is_orthonormal = true;
}
/*********************************************
* NormalizationTransform
*********************************************/
NormalizationTransform::NormalizationTransform(int d, float norm)
: VectorTransform(d, d), norm(norm) {}
NormalizationTransform::NormalizationTransform()
: VectorTransform(-1, -1), norm(-1) {}
void NormalizationTransform::apply_noalloc(idx_t n, const float* x, float* xt)
const {
if (norm == 2.0) {
memcpy(xt, x, sizeof(x[0]) * n * d_in);
fvec_renorm_L2(d_in, n, xt);
} else {
FAISS_THROW_MSG("not implemented");
}
}
void NormalizationTransform::reverse_transform(
idx_t n,
const float* xt,
float* x) const {
memcpy(x, xt, sizeof(xt[0]) * n * d_in);
}
/*********************************************
* CenteringTransform
*********************************************/
CenteringTransform::CenteringTransform(int d) : VectorTransform(d, d) {
is_trained = false;
}
void CenteringTransform::train(Index::idx_t n, const float* x) {
FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector");
mean.resize(d_in, 0);
for (idx_t i = 0; i < n; i++) {
for (size_t j = 0; j < d_in; j++) {
mean[j] += *x++;
}
}
for (size_t j = 0; j < d_in; j++) {
mean[j] /= n;
}
is_trained = true;
}
void CenteringTransform::apply_noalloc(idx_t n, const float* x, float* xt)
const {
FAISS_THROW_IF_NOT(is_trained);
for (idx_t i = 0; i < n; i++) {
for (size_t j = 0; j < d_in; j++) {
*xt++ = *x++ - mean[j];
}
}
}
void CenteringTransform::reverse_transform(idx_t n, const float* xt, float* x)
const {
FAISS_THROW_IF_NOT(is_trained);
for (idx_t i = 0; i < n; i++) {
for (size_t j = 0; j < d_in; j++) {
*x++ = *xt++ + mean[j];
}
}
}
/*********************************************
* RemapDimensionsTransform
*********************************************/
RemapDimensionsTransform::RemapDimensionsTransform(
int d_in,
int d_out,
const int* map_in)
: VectorTransform(d_in, d_out) {
map.resize(d_out);
for (int i = 0; i < d_out; i++) {
map[i] = map_in[i];
FAISS_THROW_IF_NOT(map[i] == -1 || (map[i] >= 0 && map[i] < d_in));
}
}
RemapDimensionsTransform::RemapDimensionsTransform(
int d_in,
int d_out,
bool uniform)
: VectorTransform(d_in, d_out) {
map.resize(d_out, -1);
if (uniform) {
if (d_in < d_out) {
for (int i = 0; i < d_in; i++) {
map[i * d_out / d_in] = i;
}
} else {
for (int i = 0; i < d_out; i++) {
map[i] = i * d_in / d_out;
}
}
} else {
for (int i = 0; i < d_in && i < d_out; i++)
map[i] = i;
}
}
void RemapDimensionsTransform::apply_noalloc(idx_t n, const float* x, float* xt)
const {
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < d_out; j++) {
xt[j] = map[j] < 0 ? 0 : x[map[j]];
}
x += d_in;
xt += d_out;
}
}
void RemapDimensionsTransform::reverse_transform(
idx_t n,
const float* xt,
float* x) const {
memset(x, 0, sizeof(*x) * n * d_in);
for (idx_t i = 0; i < n; i++) {
for (int j = 0; j < d_out; j++) {
if (map[j] >= 0)
x[map[j]] = xt[j];
}
x += d_in;
xt += d_out;
}
}
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_VECTOR_TRANSFORM_H
#define FAISS_VECTOR_TRANSFORM_H
/** Defines a few objects that apply transformations to a set of
* vectors. Often these are pre-processing steps.
*/
#include <stdint.h>
#include <vector>
#include <faiss/Index.h>
namespace faiss {
/** Any transformation applied on a set of vectors */
struct VectorTransform {
typedef Index::idx_t idx_t;
int d_in; ///! input dimension
int d_out; ///! output dimension
explicit VectorTransform(int d_in = 0, int d_out = 0)
: d_in(d_in), d_out(d_out), is_trained(true) {}
/// set if the VectorTransform does not require training, or if
/// training is done already
bool is_trained;
/** Perform training on a representative set of vectors. Does
* nothing by default.
*
* @param n nb of training vectors
* @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);
/** apply the transformation and return the result in a newly allocated matrix
* @param x size n * d_in
* @return size n * d_out
*/
float* apply(idx_t n, const float* x) const;
/// same as apply, but result is pre-allocated
virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0;
/// reverse transformation. May not be implemented or may return
/// approximate result
virtual void reverse_transform(idx_t n, const float* xt, float* x) const;
virtual ~VectorTransform() {}
};
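/* Usage sketch of the calling convention, for any trained transform vt
 * (vt, x, x_approx and the sizes are illustrative assumptions):
 *
 *   float* xt = vt.apply(n, x);            // new[]-allocated, size n * d_out
 *   // ... use xt, e.g. feed it to an index of dimension d_out ...
 *   vt.reverse_transform(n, xt, x_approx); // only if the transform supports it
 *   delete[] xt;
 */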
/** Generic linear transformation, with bias term applied on output
* y = A * x + b
*/
struct LinearTransform : VectorTransform {
bool have_bias; ///! whether to use the bias term
/// check if matrix A is orthonormal (enables reverse_transform)
bool is_orthonormal;
/// Transformation matrix, size d_out * d_in
std::vector<float> A;
/// bias vector, size d_out
std::vector<float> b;
/// both d_in > d_out and d_out < d_in are supported
explicit LinearTransform(
int d_in = 0,
int d_out = 0,
bool have_bias = false);
/// same as apply, but result is pre-allocated
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// compute x = A^T * (x - b)
/// this is the reverse transform if A has orthonormal rows
void transform_transpose(idx_t n, const float* y, float* x) const;
/// works only if is_orthonormal
void reverse_transform(idx_t n, const float* xt, float* x) const override;
/// compute A^T * A to set the is_orthonormal flag
void set_is_orthonormal();
bool verbose;
void print_if_verbose(
const char* name,
const std::vector<double>& mat,
int n,
int d) const;
~LinearTransform() override {}
};
/// Randomly rotate a set of vectors
struct RandomRotationMatrix : LinearTransform {
/// both d_in > d_out and d_out < d_in are supported
RandomRotationMatrix(int d_in, int d_out)
: LinearTransform(d_in, d_out, false) {}
/// must be called before the transform is used
void init(int seed);
// initializes with an arbitrary seed
void train(idx_t n, const float* x) override;
RandomRotationMatrix() {}
};
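/* Usage sketch: apply a reproducible random rotation (here also reducing
 * the dimension). The sizes and array names are illustrative assumptions.
 *
 *   faiss::RandomRotationMatrix rot(256, 64);
 *   rot.init(1234);                  // fixed seed, no training data needed
 *   std::vector<float> xt(n * 64);
 *   rot.apply_noalloc(n, x, xt.data());
 */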
/** Applies a principal component analysis on a set of vectors,
 * with optional whitening and random rotation. */
struct PCAMatrix : LinearTransform {
/** after transformation the components are multiplied by
* eigenvalues^eigen_power
*
* =0: no whitening
* =-0.5: full whitening
*/
float eigen_power;
/// value added to eigenvalues to avoid division by 0 when whitening
float epsilon;
/// random rotation after PCA
bool random_rotation;
/// ratio between # training vectors and dimension
size_t max_points_per_d;
/// try to distribute output eigenvectors in this many bins
int balanced_bins;
/// Mean, size d_in
std::vector<float> mean;
/// eigenvalues of covariance matrix (= squared singular values)
std::vector<float> eigenvalues;
/// PCA matrix, size d_in * d_in
std::vector<float> PCAMat;
// the final matrix is computed after random rotation and/or whitening
explicit PCAMatrix(
int d_in = 0,
int d_out = 0,
float eigen_power = 0,
bool random_rotation = false);
/// train on n vectors. If n < d_in then the eigenvector matrix
/// will be completed with 0s
void train(idx_t n, const float* x) override;
/// copy pre-trained PCA matrix
void copy_from(const PCAMatrix& other);
/// called after mean, PCAMat and eigenvalues are computed
void prepare_Ab();
};
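/* Usage sketch: PCA with full whitening (illustration only; `x_train` and `x`
 * are hypothetical float buffers of the stated sizes):
 *
 *   faiss::PCAMatrix pca(256, 64);        // d_in = 256, d_out = 64
 *   pca.eigen_power = -0.5;               // full whitening
 *   pca.train(nt, x_train);               // nt vectors, size nt * 256
 *   std::vector<float> out(n * 64);
 *   pca.apply_noalloc(n, x, out.data());  // write into a pre-allocated buffer
 */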
/** ITQ implementation from
*
* Iterative quantization: A procrustean approach to learning binary codes
* for large-scale image retrieval,
*
* Yunchao Gong, Svetlana Lazebnik, Albert Gordo, Florent Perronnin,
* PAMI'12.
*/
struct ITQMatrix : LinearTransform {
int max_iter;
int seed;
// force initialization of the rotation (for debugging)
std::vector<double> init_rotation;
explicit ITQMatrix(int d = 0);
void train(idx_t n, const float* x) override;
};
/** The full ITQ transform, including normalizations and PCA transformation
*/
struct ITQTransform : VectorTransform {
std::vector<float> mean;
bool do_pca;
ITQMatrix itq;
/// max training points per dimension
int max_train_per_dim;
// concatenation of PCA + ITQ transformation
LinearTransform pca_then_itq;
explicit ITQTransform(int d_in = 0, int d_out = 0, bool do_pca = false);
void train(idx_t n, const float* x) override;
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
};
struct ProductQuantizer;
/** Applies a rotation to align the dimensions with a PQ to minimize
* the reconstruction error. Can be used before an IndexPQ or an
* IndexIVFPQ. The method is the non-parametric version described in:
*
* "Optimized Product Quantization for Approximate Nearest Neighbor Search"
* Tiezheng Ge, Kaiming He, Qifa Ke, Jian Sun, CVPR'13
*
*/
struct OPQMatrix : LinearTransform {
int M; ///< nb of subquantizers
int niter; ///< Number of outer training iterations
int niter_pq; ///< Number of training iterations for the PQ
int niter_pq_0; ///< same, for the first outer iteration
/// if there are too many training points, resample
size_t max_train_points;
bool verbose;
/// if non-NULL, use this product quantizer for training
/// should be constructed with (d_out, M, _)
ProductQuantizer* pq;
/// if d2 != -1, output vectors of this dimension
explicit OPQMatrix(int d = 0, int M = 1, int d2 = -1);
void train(idx_t n, const float* x) override;
};
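/* Usage sketch: OPQ rotation in front of a PQ index via IndexPreTransform
 * (illustration only; assumes faiss/IndexPQ.h and faiss/IndexPreTransform.h
 * are included and that `x_train` / `x` are hypothetical training/database
 * buffers of dimension 128):
 *
 *   auto* opq = new faiss::OPQMatrix(128, 16);   // d = 128, M = 16 sub-vectors
 *   auto* pq  = new faiss::IndexPQ(128, 16, 8);  // 16 sub-quantizers, 8 bits each
 *   faiss::IndexPreTransform index(opq, pq);
 *   index.own_fields = true;                     // index deletes opq and pq
 *   index.train(nt, x_train);                    // trains OPQ, then PQ on rotated data
 *   index.add(n, x);
 */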
/** Remap dimensions for input vectors, possibly inserting 0s.
 * Strictly speaking this is also a linear transform, but we don't want
 * to compute it with matrix multiplies. */
struct RemapDimensionsTransform : VectorTransform {
/// map from output dimension to input, size d_out
/// -1 -> set output to 0
std::vector<int> map;
RemapDimensionsTransform(int d_in, int d_out, const int* map);
/// remap input to output, skipping or inserting dimensions as needed
/// if uniform: distribute dimensions uniformly
/// otherwise just take the first d_out ones.
RemapDimensionsTransform(int d_in, int d_out, bool uniform = true);
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// reverse transform correct only when the mapping is a permutation
void reverse_transform(idx_t n, const float* xt, float* x) const override;
RemapDimensionsTransform() {}
};
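/* Usage sketch: pad 3-dimensional vectors to 4 dimensions, filling the extra
 * component with 0 (illustration only; `x` is a hypothetical buffer of size n * 3):
 *
 *   const int map[4] = {0, 1, 2, -1};            // -1 -> output component set to 0
 *   faiss::RemapDimensionsTransform remap(3, 4, map);
 *   std::vector<float> out(n * 4);
 *   remap.apply_noalloc(n, x, out.data());
 */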
/** per-vector normalization */
struct NormalizationTransform : VectorTransform {
float norm;
explicit NormalizationTransform(int d, float norm = 2.0);
NormalizationTransform();
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// Identity transform, since the normalization is not invertible
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
/** Subtract the mean of each component from the vectors. */
struct CenteringTransform : VectorTransform {
/// Mean, size d_in = d_out
std::vector<float> mean;
explicit CenteringTransform(int d = 0);
/// train on n vectors.
void train(idx_t n, const float* x) override;
/// subtract the mean
void apply_noalloc(idx_t n, const float* x, float* xt) const override;
/// add the mean
void reverse_transform(idx_t n, const float* xt, float* x) const override;
};
} // namespace faiss
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#include <faiss/clone_index.h>
#include <cstdio>
#include <cstdlib>
#include <faiss/impl/FaissAssert.h>
#include <faiss/Index2Layer.h>
#include <faiss/IndexAdditiveQuantizer.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexHNSW.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexIVFPQR.h>
#include <faiss/IndexIVFSpectralHash.h>
#include <faiss/IndexLSH.h>
#include <faiss/IndexLattice.h>
#include <faiss/IndexNSG.h>
#include <faiss/IndexPQ.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/MetaIndexes.h>
#include <faiss/VectorTransform.h>
namespace faiss {
/*************************************************************
* cloning functions
**************************************************************/
Index* clone_index(const Index* index) {
Cloner cl;
return cl.clone_Index(index);
}
// assumes there is a copy constructor ready. Always try from most
// specific to most general. Most indexes don't have complicated
// structs, the default copy constructor often just works.
#define TRYCLONE(classname, obj) \
if (const classname* clo = dynamic_cast<const classname*>(obj)) { \
return new classname(*clo); \
} else
VectorTransform* Cloner::clone_VectorTransform(const VectorTransform* vt) {
TRYCLONE(RemapDimensionsTransform, vt)
TRYCLONE(OPQMatrix, vt)
TRYCLONE(PCAMatrix, vt)
TRYCLONE(ITQMatrix, vt)
TRYCLONE(RandomRotationMatrix, vt)
TRYCLONE(LinearTransform, vt) {
FAISS_THROW_MSG("clone not supported for this type of VectorTransform");
}
return nullptr;
}
IndexIVF* Cloner::clone_IndexIVF(const IndexIVF* ivf) {
TRYCLONE(IndexIVFPQR, ivf)
TRYCLONE(IndexIVFPQ, ivf)
TRYCLONE(IndexIVFFlat, ivf)
TRYCLONE(IndexIVFScalarQuantizer, ivf) {
FAISS_THROW_MSG("clone not supported for this type of IndexIVF");
}
return nullptr;
}
Index* Cloner::clone_Index(const Index* index) {
TRYCLONE(IndexPQ, index)
TRYCLONE(IndexLSH, index)
TRYCLONE(IndexFlatL2, index)
TRYCLONE(IndexFlatIP, index)
TRYCLONE(IndexFlat, index)
TRYCLONE(IndexLattice, index)
TRYCLONE(IndexResidualQuantizer, index)
TRYCLONE(IndexScalarQuantizer, index)
TRYCLONE(MultiIndexQuantizer, index)
TRYCLONE(ResidualCoarseQuantizer, index)
if (const IndexIVF* ivf = dynamic_cast<const IndexIVF*>(index)) {
IndexIVF* res = clone_IndexIVF(ivf);
if (ivf->invlists == nullptr) {
res->invlists = nullptr;
} else if (
auto* ails = dynamic_cast<const ArrayInvertedLists*>(
ivf->invlists)) {
res->invlists = new ArrayInvertedLists(*ails);
res->own_invlists = true;
} else {
FAISS_THROW_MSG(
"clone not supported for this type of inverted lists");
}
res->own_fields = true;
res->quantizer = clone_Index(ivf->quantizer);
return res;
} else if (
const IndexPreTransform* ipt =
dynamic_cast<const IndexPreTransform*>(index)) {
IndexPreTransform* res = new IndexPreTransform();
res->d = ipt->d;
res->ntotal = ipt->ntotal;
res->is_trained = ipt->is_trained;
res->metric_type = ipt->metric_type;
res->metric_arg = ipt->metric_arg;
res->index = clone_Index(ipt->index);
for (int i = 0; i < ipt->chain.size(); i++)
res->chain.push_back(clone_VectorTransform(ipt->chain[i]));
res->own_fields = true;
return res;
} else if (
const IndexIDMap* idmap = dynamic_cast<const IndexIDMap*>(index)) {
IndexIDMap* res = new IndexIDMap(*idmap);
res->own_fields = true;
res->index = clone_Index(idmap->index);
return res;
} else if (const IndexHNSW* ihnsw = dynamic_cast<const IndexHNSW*>(index)) {
IndexHNSW* res = new IndexHNSW(*ihnsw);
res->own_fields = true;
res->storage = clone_Index(ihnsw->storage);
return res;
} else if (const IndexNSG* insg = dynamic_cast<const IndexNSG*>(index)) {
IndexNSG* res = new IndexNSG(*insg);
// copy the dynamically allocated graph
auto& new_graph = res->nsg.final_graph;
auto& old_graph = insg->nsg.final_graph;
new_graph = std::make_shared<nsg::Graph<int>>(*old_graph);
res->own_fields = true;
res->storage = clone_Index(insg->storage);
return res;
} else if (
const Index2Layer* i2l = dynamic_cast<const Index2Layer*>(index)) {
Index2Layer* res = new Index2Layer(*i2l);
res->q1.own_fields = true;
res->q1.quantizer = clone_Index(i2l->q1.quantizer);
return res;
} else {
FAISS_THROW_MSG("clone not supported for this type of Index");
}
return nullptr;
}
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
// Cloning code for indexes
#pragma once
namespace faiss {
struct Index;
struct IndexIVF;
struct VectorTransform;
/* cloning functions */
Index* clone_index(const Index*);
/** Cloner class, useful to override classes with other cloning
* functions. The cloning function above just calls
* Cloner::clone_Index. */
struct Cloner {
virtual VectorTransform* clone_VectorTransform(const VectorTransform*);
virtual Index* clone_Index(const Index*);
virtual IndexIVF* clone_IndexIVF(const IndexIVF*);
virtual ~Cloner() {}
};
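/* Usage sketch (illustration only; `index` is any existing faiss::Index*
 * supported by Cloner, e.g. an IndexFlat or IndexIVFPQ):
 *
 *   faiss::Index* copy = faiss::clone_index(index);   // deep copy
 *   // ... the copy can be modified independently of the original ...
 *   delete copy;
 */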
} // namespace faiss
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
set(FAISS_GPU_SRC
GpuAutoTune.cpp
GpuCloner.cpp
GpuClonerOptions.cpp
# HC
#GpuDistance.cu
GpuIcmEncoder.cu
GpuIndex.cu
GpuIndexBinaryFlat.cu
GpuIndexFlat.cu
GpuIndexIVF.cu
GpuIndexIVFFlat.cu
GpuIndexIVFPQ.cu
# HC
#GpuIndexIVFScalarQuantizer.cu
GpuResources.cpp
StandardGpuResources.cpp
# HC
#impl/BinaryDistance.cu
impl/BinaryFlatIndex.cu
impl/BroadcastSum.cu
impl/Distance.cu
# HC
#impl/FlatIndex.cu
impl/IVFAppend.cu
impl/IVFBase.cu
impl/IVFFlat.cu
impl/IVFFlatScan.cu
# HC
#impl/IVFInterleaved.cu
#impl/IVFPQ.cu
impl/IVFUtils.cu
#impl/IVFUtilsSelect1.cu
#impl/IVFUtilsSelect2.cu
impl/InterleavedCodes.cpp
# HC
#impl/L2Norm.cu
#impl/L2Select.cu
#impl/PQScanMultiPassPrecomputed.cu
impl/RemapIndices.cpp
impl/VectorResidual.cu
# HC
#impl/scan/IVFInterleaved1.cu
#impl/scan/IVFInterleaved32.cu
#impl/scan/IVFInterleaved64.cu
#impl/scan/IVFInterleaved128.cu
#impl/scan/IVFInterleaved256.cu
#impl/scan/IVFInterleaved512.cu
#impl/scan/IVFInterleaved1024.cu
#impl/scan/IVFInterleaved2048.cu
#impl/IcmEncoder.cu
#utils/BlockSelectFloat.cu
utils/DeviceUtils.cu
utils/StackDeviceMemory.cpp
utils/Timer.cpp
utils/WarpSelectFloat.cu
# HC
#utils/blockselect/BlockSelectFloat1.cu
#utils/blockselect/BlockSelectFloat32.cu
#utils/blockselect/BlockSelectFloat64.cu
#utils/blockselect/BlockSelectFloat128.cu
#utils/blockselect/BlockSelectFloat256.cu
#utils/blockselect/BlockSelectFloatF512.cu
#utils/blockselect/BlockSelectFloatF1024.cu
#utils/blockselect/BlockSelectFloatF2048.cu
#utils/blockselect/BlockSelectFloatT512.cu
#utils/blockselect/BlockSelectFloatT1024.cu
#utils/blockselect/BlockSelectFloatT2048.cu
#utils/warpselect/WarpSelectFloat1.cu
#utils/warpselect/WarpSelectFloat32.cu
#utils/warpselect/WarpSelectFloat64.cu
#utils/warpselect/WarpSelectFloat128.cu
#utils/warpselect/WarpSelectFloat256.cu
#utils/warpselect/WarpSelectFloatF512.cu
#utils/warpselect/WarpSelectFloatF1024.cu
#utils/warpselect/WarpSelectFloatF2048.cu
#utils/warpselect/WarpSelectFloatT512.cu
#utils/warpselect/WarpSelectFloatT1024.cu
#utils/warpselect/WarpSelectFloatT2048.cu
)
set(FAISS_GPU_HEADERS
GpuAutoTune.h
GpuCloner.h
GpuClonerOptions.h
GpuDistance.h
GpuIcmEncoder.h
GpuFaissAssert.h
GpuIndex.h
GpuIndexBinaryFlat.h
GpuIndexFlat.h
GpuIndexIVF.h
GpuIndexIVFFlat.h
GpuIndexIVFPQ.h
GpuIndexIVFScalarQuantizer.h
GpuIndicesOptions.h
GpuResources.h
StandardGpuResources.h
impl/BinaryDistance.cuh
impl/BinaryFlatIndex.cuh
impl/BroadcastSum.cuh
impl/Distance.cuh
impl/DistanceUtils.cuh
impl/FlatIndex.cuh
impl/GeneralDistance.cuh
impl/GpuScalarQuantizer.cuh
impl/IVFAppend.cuh
impl/IVFBase.cuh
impl/IVFFlat.cuh
impl/IVFFlatScan.cuh
impl/IVFInterleaved.cuh
impl/IVFPQ.cuh
impl/IVFUtils.cuh
impl/InterleavedCodes.h
impl/L2Norm.cuh
impl/L2Select.cuh
impl/PQCodeDistances-inl.cuh
impl/PQCodeDistances.cuh
impl/PQCodeLoad.cuh
impl/PQScanMultiPassNoPrecomputed-inl.cuh
impl/PQScanMultiPassNoPrecomputed.cuh
impl/PQScanMultiPassPrecomputed.cuh
impl/RemapIndices.h
impl/VectorResidual.cuh
impl/scan/IVFInterleavedImpl.cuh
impl/IcmEncoder.cuh
utils/BlockSelectKernel.cuh
utils/Comparators.cuh
utils/ConversionOperators.cuh
utils/CopyUtils.cuh
utils/DeviceDefs.cuh
utils/DeviceTensor-inl.cuh
utils/DeviceTensor.cuh
utils/DeviceUtils.h
utils/DeviceVector.cuh
utils/Float16.cuh
utils/HostTensor-inl.cuh
utils/HostTensor.cuh
utils/Limits.cuh
utils/LoadStoreOperators.cuh
utils/MathOperators.cuh
utils/MatrixMult-inl.cuh
utils/MatrixMult.cuh
utils/MergeNetworkBlock.cuh
utils/MergeNetworkUtils.cuh
utils/MergeNetworkWarp.cuh
utils/NoTypeTensor.cuh
utils/Pair.cuh
utils/PtxUtils.cuh
utils/ReductionOperators.cuh
utils/Reductions.cuh
utils/Select.cuh
utils/StackDeviceMemory.h
utils/StaticUtils.h
utils/Tensor-inl.cuh
utils/Tensor.cuh
utils/ThrustAllocator.cuh
utils/Timer.h
utils/Transpose.cuh
utils/WarpPackedBits.cuh
utils/WarpSelectKernel.cuh
utils/WarpShuffles.cuh
utils/blockselect/BlockSelectImpl.cuh
utils/warpselect/WarpSelectImpl.cuh
)
# Export FAISS_GPU_HEADERS variable to parent scope.
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)
target_sources(faiss PRIVATE ${FAISS_GPU_SRC})
target_sources(faiss_avx2 PRIVATE ${FAISS_GPU_SRC})
foreach(header ${FAISS_GPU_HEADERS})
get_filename_component(dir ${header} DIRECTORY )
install(FILES ${header}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/faiss/gpu/${dir}
)
endforeach()
# HC
#find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss)
target_link_libraries(faiss_avx2)
#target_link_libraries(faiss_avx2 PRIVATE hipblas)
#target_compile_options(faiss PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
#target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>)
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuAutoTune.h>
#include <typeinfo>
#include <faiss/IndexPreTransform.h>
#include <faiss/IndexReplicas.h>
#include <faiss/IndexShards.h>
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/GpuIndexIVFPQ.h>
#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
namespace gpu {
using namespace ::faiss;
/**********************************************************
 * Parameters to auto-tune on GpuIndexes
**********************************************************/
#define DC(classname) auto ix = dynamic_cast<const classname*>(index)
void GpuParameterSpace::initialize(const Index* index) {
if (DC(IndexPreTransform)) {
index = ix->index;
}
if (DC(IndexReplicas)) {
if (ix->count() == 0)
return;
index = ix->at(0);
}
if (DC(IndexShards)) {
if (ix->count() == 0)
return;
index = ix->at(0);
}
if (DC(GpuIndexIVF)) {
ParameterRange& pr = add_range("nprobe");
for (int i = 0; i < 12; i++) {
size_t nprobe = 1 << i;
if (nprobe >= ix->getNumLists() || nprobe > getMaxKSelection())
break;
pr.values.push_back(nprobe);
}
}
// not sure we should call the parent initializer
}
#undef DC
// non-const version
#define DC(classname) auto* ix = dynamic_cast<classname*>(index)
void GpuParameterSpace::set_index_parameter(
Index* index,
const std::string& name,
double val) const {
if (DC(IndexReplicas)) {
for (int i = 0; i < ix->count(); i++)
set_index_parameter(ix->at(i), name, val);
return;
}
if (name == "nprobe") {
if (DC(GpuIndexIVF)) {
ix->setNumProbes(int(val));
return;
}
}
if (name == "use_precomputed_table") {
if (DC(GpuIndexIVFPQ)) {
ix->setPrecomputedCodes(bool(val));
return;
}
}
// maybe normal index parameters apply?
ParameterSpace::set_index_parameter(index, name, val);
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/AutoTune.h>
#include <faiss/Index.h>
namespace faiss {
namespace gpu {
/// parameter space and setters for GPU indexes
struct GpuParameterSpace : faiss::ParameterSpace {
/// initialize with reasonable parameters for the index
void initialize(const faiss::Index* index) override;
/// set a combination of parameters on an index
void set_index_parameter(
faiss::Index* index,
const std::string& name,
double val) const override;
};
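/* Usage sketch (illustration only; `gpu_index` is a hypothetical, already
 * built faiss::gpu::GpuIndexIVF*):
 *
 *   faiss::gpu::GpuParameterSpace params;
 *   params.initialize(gpu_index);                        // discovers e.g. nprobe values
 *   params.set_index_parameter(gpu_index, "nprobe", 32); // tune at run time
 */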
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuCloner.h>
#include <faiss/impl/FaissAssert.h>
#include <typeinfo>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVF.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexIVFPQ.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/IndexReplicas.h>
#include <faiss/IndexScalarQuantizer.h>
#include <faiss/MetaIndexes.h>
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/GpuIndexIVFPQ.h>
#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/index_io.h>
namespace faiss {
namespace gpu {
/**********************************************************
* Cloning to CPU
**********************************************************/
void ToCPUCloner::merge_index(Index* dst, Index* src, bool successive_ids) {
if (auto ifl = dynamic_cast<IndexFlat*>(dst)) {
auto ifl2 = dynamic_cast<const IndexFlat*>(src);
FAISS_ASSERT(ifl2);
FAISS_ASSERT(successive_ids);
ifl->add(ifl2->ntotal, ifl2->get_xb());
} else if (auto ifl = dynamic_cast<IndexIVFFlat*>(dst)) {
auto ifl2 = dynamic_cast<IndexIVFFlat*>(src);
FAISS_ASSERT(ifl2);
ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0);
} else if (auto ifl = dynamic_cast<IndexIVFScalarQuantizer*>(dst)) {
auto ifl2 = dynamic_cast<IndexIVFScalarQuantizer*>(src);
FAISS_ASSERT(ifl2);
ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0);
} else if (auto ifl = dynamic_cast<IndexIVFPQ*>(dst)) {
auto ifl2 = dynamic_cast<IndexIVFPQ*>(src);
FAISS_ASSERT(ifl2);
ifl->merge_from(*ifl2, successive_ids ? ifl->ntotal : 0);
} else {
FAISS_ASSERT(!"merging not implemented for this type of class");
}
}
Index* ToCPUCloner::clone_Index(const Index* index) {
if (auto ifl = dynamic_cast<const GpuIndexFlat*>(index)) {
IndexFlat* res = new IndexFlat();
ifl->copyTo(res);
return res;
} else if (auto ifl = dynamic_cast<const GpuIndexIVFFlat*>(index)) {
IndexIVFFlat* res = new IndexIVFFlat();
ifl->copyTo(res);
return res;
} else if (
auto ifl = dynamic_cast<const GpuIndexIVFScalarQuantizer*>(index)) {
IndexIVFScalarQuantizer* res = new IndexIVFScalarQuantizer();
ifl->copyTo(res);
return res;
} else if (auto ipq = dynamic_cast<const GpuIndexIVFPQ*>(index)) {
IndexIVFPQ* res = new IndexIVFPQ();
ipq->copyTo(res);
return res;
// for IndexShards and IndexReplicas we assume that the
// objective is to make a single component out of them
// (inverse op of ToGpuClonerMultiple)
} else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
int nshard = ish->count();
FAISS_ASSERT(nshard > 0);
Index* res = clone_Index(ish->at(0));
for (int i = 1; i < ish->count(); i++) {
Index* res_i = clone_Index(ish->at(i));
merge_index(res, res_i, ish->successive_ids);
delete res_i;
}
return res;
} else if (auto ipr = dynamic_cast<const IndexReplicas*>(index)) {
// just clone one of the replicas
FAISS_ASSERT(ipr->count() > 0);
return clone_Index(ipr->at(0));
} else {
return Cloner::clone_Index(index);
}
}
faiss::Index* index_gpu_to_cpu(const faiss::Index* gpu_index) {
ToCPUCloner cl;
return cl.clone_Index(gpu_index);
}
/**********************************************************
* Cloning to 1 GPU
**********************************************************/
ToGpuCloner::ToGpuCloner(
GpuResourcesProvider* prov,
int device,
const GpuClonerOptions& options)
: GpuClonerOptions(options), provider(prov), device(device) {}
Index* ToGpuCloner::clone_Index(const Index* index) {
using idx_t = Index::idx_t;
if (auto ifl = dynamic_cast<const IndexFlat*>(index)) {
GpuIndexFlatConfig config;
config.device = device;
config.useFloat16 = useFloat16;
config.storeTransposed = storeTransposed;
return new GpuIndexFlat(provider, ifl, config);
} else if (
dynamic_cast<const IndexScalarQuantizer*>(index) &&
static_cast<const IndexScalarQuantizer*>(index)->sq.qtype ==
ScalarQuantizer::QT_fp16) {
GpuIndexFlatConfig config;
config.device = device;
config.useFloat16 = true;
GpuIndexFlat* gif = new GpuIndexFlat(
provider, index->d, index->metric_type, config);
// transfer data by blocks
idx_t bs = 1024 * 1024;
for (idx_t i0 = 0; i0 < index->ntotal; i0 += bs) {
idx_t i1 = std::min(i0 + bs, index->ntotal);
std::vector<float> buffer((i1 - i0) * index->d);
index->reconstruct_n(i0, i1 - i0, buffer.data());
gif->add(i1 - i0, buffer.data());
}
assert(gif->getNumVecs() == index->ntotal);
return gif;
} else if (auto ifl = dynamic_cast<const faiss::IndexIVFFlat*>(index)) {
GpuIndexIVFFlatConfig config;
config.device = device;
config.indicesOptions = indicesOptions;
config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
config.flatConfig.storeTransposed = storeTransposed;
GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
provider, ifl->d, ifl->nlist, ifl->metric_type, config);
if (reserveVecs > 0 && ifl->ntotal == 0) {
res->reserveMemory(reserveVecs);
}
res->copyFrom(ifl);
return res;
} else if (
auto ifl = dynamic_cast<const faiss::IndexIVFScalarQuantizer*>(
index)) {
GpuIndexIVFScalarQuantizerConfig config;
config.device = device;
config.indicesOptions = indicesOptions;
config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
config.flatConfig.storeTransposed = storeTransposed;
GpuIndexIVFScalarQuantizer* res = new GpuIndexIVFScalarQuantizer(
provider,
ifl->d,
ifl->nlist,
ifl->sq.qtype,
ifl->metric_type,
ifl->by_residual,
config);
if (reserveVecs > 0 && ifl->ntotal == 0) {
res->reserveMemory(reserveVecs);
}
res->copyFrom(ifl);
return res;
} else if (auto ipq = dynamic_cast<const faiss::IndexIVFPQ*>(index)) {
if (verbose) {
printf(" IndexIVFPQ size %ld -> GpuIndexIVFPQ "
"indicesOptions=%d "
"usePrecomputed=%d useFloat16=%d reserveVecs=%ld\n",
ipq->ntotal,
indicesOptions,
usePrecomputed,
useFloat16,
reserveVecs);
}
GpuIndexIVFPQConfig config;
config.device = device;
config.indicesOptions = indicesOptions;
config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
config.flatConfig.storeTransposed = storeTransposed;
config.useFloat16LookupTables = useFloat16;
config.usePrecomputedTables = usePrecomputed;
GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
if (reserveVecs > 0 && ipq->ntotal == 0) {
res->reserveMemory(reserveVecs);
}
return res;
} else {
// default: use CPU cloner
return Cloner::clone_Index(index);
}
}
faiss::Index* index_cpu_to_gpu(
GpuResourcesProvider* provider,
int device,
const faiss::Index* index,
const GpuClonerOptions* options) {
GpuClonerOptions defaults;
ToGpuCloner cl(provider, device, options ? *options : defaults);
return cl.clone_Index(index);
}
/**********************************************************
* Cloning to multiple GPUs
**********************************************************/
ToGpuClonerMultiple::ToGpuClonerMultiple(
std::vector<GpuResourcesProvider*>& provider,
std::vector<int>& devices,
const GpuMultipleClonerOptions& options)
: GpuMultipleClonerOptions(options) {
FAISS_ASSERT(provider.size() == devices.size());
for (int i = 0; i < provider.size(); i++) {
sub_cloners.push_back(ToGpuCloner(provider[i], devices[i], options));
}
}
ToGpuClonerMultiple::ToGpuClonerMultiple(
const std::vector<ToGpuCloner>& sub_cloners,
const GpuMultipleClonerOptions& options)
: GpuMultipleClonerOptions(options), sub_cloners(sub_cloners) {}
void ToGpuClonerMultiple::copy_ivf_shard(
const IndexIVF* index_ivf,
IndexIVF* idx2,
long n,
long i) {
if (shard_type == 2) {
long i0 = i * index_ivf->ntotal / n;
long i1 = (i + 1) * index_ivf->ntotal / n;
if (verbose)
printf("IndexShards shard %ld indices %ld:%ld\n", i, i0, i1);
index_ivf->copy_subset_to(*idx2, 2, i0, i1);
FAISS_ASSERT(idx2->ntotal == i1 - i0);
} else if (shard_type == 1) {
if (verbose)
printf("IndexShards shard %ld select modulo %ld = %ld\n", i, n, i);
index_ivf->copy_subset_to(*idx2, 1, n, i);
} else {
FAISS_THROW_FMT("shard_type %d not implemented", shard_type);
}
}
Index* ToGpuClonerMultiple::clone_Index_to_shards(const Index* index) {
long n = sub_cloners.size();
auto index_ivfpq = dynamic_cast<const faiss::IndexIVFPQ*>(index);
auto index_ivfflat = dynamic_cast<const faiss::IndexIVFFlat*>(index);
auto index_ivfsq =
dynamic_cast<const faiss::IndexIVFScalarQuantizer*>(index);
auto index_flat = dynamic_cast<const faiss::IndexFlat*>(index);
FAISS_THROW_IF_NOT_MSG(
index_ivfpq || index_ivfflat || index_flat || index_ivfsq,
"IndexShards implemented only for "
"IndexIVFFlat, IndexIVFScalarQuantizer, "
"IndexFlat and IndexIVFPQ");
std::vector<faiss::Index*> shards(n);
for (long i = 0; i < n; i++) {
// make a shallow copy
if (reserveVecs)
sub_cloners[i].reserveVecs = (reserveVecs + n - 1) / n;
if (index_ivfpq) {
faiss::IndexIVFPQ idx2(
index_ivfpq->quantizer,
index_ivfpq->d,
index_ivfpq->nlist,
index_ivfpq->code_size,
index_ivfpq->pq.nbits);
idx2.metric_type = index_ivfpq->metric_type;
idx2.pq = index_ivfpq->pq;
idx2.nprobe = index_ivfpq->nprobe;
idx2.use_precomputed_table = 0;
idx2.is_trained = index->is_trained;
copy_ivf_shard(index_ivfpq, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfflat) {
faiss::IndexIVFFlat idx2(
index_ivfflat->quantizer,
index->d,
index_ivfflat->nlist,
index_ivfflat->metric_type);
idx2.nprobe = index_ivfflat->nprobe;
idx2.is_trained = index->is_trained;
copy_ivf_shard(index_ivfflat, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_ivfsq) {
faiss::IndexIVFScalarQuantizer idx2(
index_ivfsq->quantizer,
index->d,
index_ivfsq->nlist,
index_ivfsq->sq.qtype,
index_ivfsq->metric_type,
index_ivfsq->by_residual);
idx2.nprobe = index_ivfsq->nprobe;
idx2.is_trained = index->is_trained;
idx2.sq = index_ivfsq->sq;
copy_ivf_shard(index_ivfsq, &idx2, n, i);
shards[i] = sub_cloners[i].clone_Index(&idx2);
} else if (index_flat) {
faiss::IndexFlat idx2(index->d, index->metric_type);
shards[i] = sub_cloners[i].clone_Index(&idx2);
if (index->ntotal > 0) {
long i0 = index->ntotal * i / n;
long i1 = index->ntotal * (i + 1) / n;
shards[i]->add(i1 - i0, index_flat->get_xb() + i0 * index->d);
}
}
}
bool successive_ids = index_flat != nullptr;
faiss::IndexShards* res =
new faiss::IndexShards(index->d, true, successive_ids);
for (int i = 0; i < n; i++) {
res->add_shard(shards[i]);
}
res->own_fields = true;
FAISS_ASSERT(index->ntotal == res->ntotal);
return res;
}
Index* ToGpuClonerMultiple::clone_Index(const Index* index) {
long n = sub_cloners.size();
if (n == 1)
return sub_cloners[0].clone_Index(index);
if (dynamic_cast<const IndexFlat*>(index) ||
dynamic_cast<const faiss::IndexIVFFlat*>(index) ||
dynamic_cast<const faiss::IndexIVFScalarQuantizer*>(index) ||
dynamic_cast<const faiss::IndexIVFPQ*>(index)) {
if (!shard) {
IndexReplicas* res = new IndexReplicas();
for (auto& sub_cloner : sub_cloners) {
res->addIndex(sub_cloner.clone_Index(index));
}
res->own_fields = true;
return res;
} else {
return clone_Index_to_shards(index);
}
} else if (auto miq = dynamic_cast<const MultiIndexQuantizer*>(index)) {
if (verbose) {
printf("cloning MultiIndexQuantizer: "
"will be valid only for search k=1\n");
}
const ProductQuantizer& pq = miq->pq;
IndexSplitVectors* splitv = new IndexSplitVectors(pq.d, true);
splitv->own_fields = true;
for (int m = 0; m < pq.M; m++) {
// which GPU(s) will be assigned to this sub-quantizer
long i0 = m * n / pq.M;
long i1 = pq.M <= n ? (m + 1) * n / pq.M : i0 + 1;
std::vector<ToGpuCloner> sub_cloners_2;
sub_cloners_2.insert(
sub_cloners_2.begin(),
sub_cloners.begin() + i0,
sub_cloners.begin() + i1);
ToGpuClonerMultiple cm(sub_cloners_2, *this);
IndexFlatL2 idxc(pq.dsub);
idxc.add(pq.ksub, pq.centroids.data() + m * pq.d * pq.ksub);
Index* idx2 = cm.clone_Index(&idxc);
splitv->add_sub_index(idx2);
}
return splitv;
} else {
return Cloner::clone_Index(index);
}
}
faiss::Index* index_cpu_to_gpu_multiple(
std::vector<GpuResourcesProvider*>& provider,
std::vector<int>& devices,
const faiss::Index* index,
const GpuMultipleClonerOptions* options) {
GpuMultipleClonerOptions defaults;
ToGpuClonerMultiple cl(provider, devices, options ? *options : defaults);
return cl.clone_Index(index);
}
GpuProgressiveDimIndexFactory::GpuProgressiveDimIndexFactory(int ngpu) {
FAISS_THROW_IF_NOT(ngpu >= 1);
devices.resize(ngpu);
vres.resize(ngpu);
for (int i = 0; i < ngpu; i++) {
vres[i] = new StandardGpuResources();
devices[i] = i;
}
ncall = 0;
}
GpuProgressiveDimIndexFactory::~GpuProgressiveDimIndexFactory() {
for (int i = 0; i < vres.size(); i++) {
delete vres[i];
}
}
Index* GpuProgressiveDimIndexFactory::operator()(int dim) {
IndexFlatL2 index(dim);
ncall++;
return index_cpu_to_gpu_multiple(vres, devices, &index, &options);
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <vector>
#include <faiss/Clustering.h>
#include <faiss/Index.h>
#include <faiss/clone_index.h>
#include <faiss/gpu/GpuClonerOptions.h>
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuIndicesOptions.h>
namespace faiss {
namespace gpu {
class GpuResourcesProvider;
/// Cloner specialized for GPU -> CPU
struct ToCPUCloner : faiss::Cloner {
void merge_index(Index* dst, Index* src, bool successive_ids);
Index* clone_Index(const Index* index) override;
};
/// Cloner specialized for CPU -> 1 GPU
struct ToGpuCloner : faiss::Cloner, GpuClonerOptions {
GpuResourcesProvider* provider;
int device;
ToGpuCloner(
GpuResourcesProvider* prov,
int device,
const GpuClonerOptions& options);
Index* clone_Index(const Index* index) override;
};
/// Cloner specialized for CPU -> multiple GPUs
struct ToGpuClonerMultiple : faiss::Cloner, GpuMultipleClonerOptions {
std::vector<ToGpuCloner> sub_cloners;
ToGpuClonerMultiple(
std::vector<GpuResourcesProvider*>& provider,
std::vector<int>& devices,
const GpuMultipleClonerOptions& options);
ToGpuClonerMultiple(
const std::vector<ToGpuCloner>& sub_cloners,
const GpuMultipleClonerOptions& options);
void copy_ivf_shard(
const IndexIVF* index_ivf,
IndexIVF* idx2,
long n,
long i);
Index* clone_Index_to_shards(const Index* index);
/// main function
Index* clone_Index(const Index* index) override;
};
/// converts any GPU index inside gpu_index to a CPU index
faiss::Index* index_gpu_to_cpu(const faiss::Index* gpu_index);
/// converts any CPU index that can be converted to GPU
faiss::Index* index_cpu_to_gpu(
GpuResourcesProvider* provider,
int device,
const faiss::Index* index,
const GpuClonerOptions* options = nullptr);
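/* Usage sketch for the single-GPU round trip (illustration only; `cpu_index`
 * is a hypothetical trained faiss::Index* and error handling is omitted):
 *
 *   faiss::gpu::StandardGpuResources res;     // from faiss/gpu/StandardGpuResources.h
 *   faiss::gpu::GpuClonerOptions opts;
 *   opts.useFloat16 = true;                   // optional
 *   faiss::Index* gpu_index =
 *           faiss::gpu::index_cpu_to_gpu(&res, 0, cpu_index, &opts);
 *   // ... search on the GPU ...
 *   faiss::Index* back_on_cpu = faiss::gpu::index_gpu_to_cpu(gpu_index);
 */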
faiss::Index* index_cpu_to_gpu_multiple(
std::vector<GpuResourcesProvider*>& provider,
std::vector<int>& devices,
const faiss::Index* index,
const GpuMultipleClonerOptions* options = nullptr);
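/* Usage sketch: shard a CPU index across two GPUs (illustration only; `res0`,
 * `res1` are hypothetical StandardGpuResources instances and `cpu_index` a
 * trained faiss::Index*):
 *
 *   std::vector<faiss::gpu::GpuResourcesProvider*> res = {&res0, &res1};
 *   std::vector<int> devices = {0, 1};
 *   faiss::gpu::GpuMultipleClonerOptions opts;
 *   opts.shard = true;                        // split the vectors across the GPUs
 *   faiss::Index* multi =
 *           faiss::gpu::index_cpu_to_gpu_multiple(res, devices, cpu_index, &opts);
 */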
/// index factory for the ProgressiveDimClustering object
struct GpuProgressiveDimIndexFactory : ProgressiveDimIndexFactory {
GpuMultipleClonerOptions options;
std::vector<GpuResourcesProvider*> vres;
std::vector<int> devices;
int ncall;
explicit GpuProgressiveDimIndexFactory(int ngpu);
Index* operator()(int dim) override;
virtual ~GpuProgressiveDimIndexFactory() override;
};
} // namespace gpu
} // namespace faiss