Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/IndexBinaryFlat.h>
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <memory>
namespace faiss {
namespace gpu {
class BinaryFlatIndex;
struct GpuIndexBinaryFlatConfig : public GpuIndexConfig {};
/// A GPU version of IndexBinaryFlat for brute-force comparison of bit vectors
/// via Hamming distance
class GpuIndexBinaryFlat : public IndexBinary {
public:
/// Construct from a pre-existing faiss::IndexBinaryFlat instance, copying
/// data over to the given GPU
GpuIndexBinaryFlat(
GpuResourcesProvider* resources,
const faiss::IndexBinaryFlat* index,
GpuIndexBinaryFlatConfig config = GpuIndexBinaryFlatConfig());
/// Construct an empty instance that can be added to
GpuIndexBinaryFlat(
GpuResourcesProvider* resources,
int dims,
GpuIndexBinaryFlatConfig config = GpuIndexBinaryFlatConfig());
~GpuIndexBinaryFlat() override;
/// Returns the device that this index is resident on
int getDevice() const;
/// Returns a reference to our GpuResources object that manages memory,
/// stream and handle resources on the GPU
std::shared_ptr<GpuResources> getResources();
/// Initialize ourselves from the given CPU index; will overwrite
/// all data in ourselves
void copyFrom(const faiss::IndexBinaryFlat* index);
/// Copy ourselves to the given CPU index; will overwrite all data
/// in the index instance
void copyTo(faiss::IndexBinaryFlat* index) const;
void add(faiss::IndexBinary::idx_t n, const uint8_t* x) override;
void reset() override;
void search(
faiss::IndexBinary::idx_t n,
const uint8_t* x,
faiss::IndexBinary::idx_t k,
int32_t* distances,
faiss::IndexBinary::idx_t* labels) const override;
void reconstruct(faiss::IndexBinary::idx_t key, uint8_t* recons)
const override;
protected:
/// Called from search when the input data is on the CPU;
/// potentially allows for pinned memory usage
void searchFromCpuPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const;
void searchNonPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const;
protected:
/// Manages streams, cuBLAS handles and scratch memory for devices
std::shared_ptr<GpuResources> resources_;
/// Configuration options
const GpuIndexBinaryFlatConfig binaryFlatConfig_;
/// Holds our GPU data containing the list of vectors
std::unique_ptr<BinaryFlatIndex> data_;
};
} // namespace gpu
} // namespace faiss
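For orientation, a minimal usage sketch of the GpuIndexBinaryFlat header above follows. It is not part of this commit, and it assumes StandardGpuResources from upstream Faiss (faiss/gpu/StandardGpuResources.h) as the GpuResourcesProvider implementation.
// Hedged example: build an empty GpuIndexBinaryFlat, add binary vectors,
// and run a Hamming-distance search. Not part of this diff.
#include <faiss/gpu/GpuIndexBinaryFlat.h>
#include <faiss/gpu/StandardGpuResources.h> // assumed from upstream Faiss
#include <vector>
int main() {
    int dims = 256;                       // dimension in bits; a multiple of 8
    faiss::gpu::StandardGpuResources res; // implements GpuResourcesProvider
    faiss::gpu::GpuIndexBinaryFlatConfig config;
    config.device = 0;                    // field inherited from GpuIndexConfig
    faiss::gpu::GpuIndexBinaryFlat index(&res, dims, config);
    // 1000 database vectors of dims / 8 bytes each
    std::vector<uint8_t> db(1000 * (dims / 8));
    for (size_t i = 0; i < db.size(); ++i) {
        db[i] = static_cast<uint8_t>(i * 37);
    }
    index.add(1000, db.data());
    // Query with the first vector; distances are Hamming distances
    int k = 4;
    std::vector<int32_t> distances(k);
    std::vector<faiss::IndexBinary::idx_t> labels(k);
    index.search(1, db.data(), k, distances.data(), labels.data());
    return 0;
}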
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexFlat.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/Float16.cuh>
#include <limits>
namespace faiss {
namespace gpu {
GpuIndexFlat::GpuIndexFlat(
GpuResourcesProvider* provider,
const faiss::IndexFlat* index,
GpuIndexFlatConfig config)
: GpuIndex(
provider->getResources(),
index->d,
index->metric_type,
index->metric_arg,
config),
flatConfig_(config) {
// Flat index doesn't need training
this->is_trained = true;
copyFrom(index);
}
GpuIndexFlat::GpuIndexFlat(
std::shared_ptr<GpuResources> resources,
const faiss::IndexFlat* index,
GpuIndexFlatConfig config)
: GpuIndex(
resources,
index->d,
index->metric_type,
index->metric_arg,
config),
flatConfig_(config) {
// Flat index doesn't need training
this->is_trained = true;
copyFrom(index);
}
GpuIndexFlat::GpuIndexFlat(
GpuResourcesProvider* provider,
int dims,
faiss::MetricType metric,
GpuIndexFlatConfig config)
: GpuIndex(provider->getResources(), dims, metric, 0, config),
flatConfig_(config) {
// Flat index doesn't need training
this->is_trained = true;
// Construct index
DeviceScope scope(config_.device);
data_.reset(new FlatIndex(
resources_.get(),
dims,
flatConfig_.useFloat16,
flatConfig_.storeTransposed,
config_.memorySpace));
}
GpuIndexFlat::GpuIndexFlat(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
GpuIndexFlatConfig config)
: GpuIndex(resources, dims, metric, 0, config), flatConfig_(config) {
// Flat index doesn't need training
this->is_trained = true;
// Construct index
DeviceScope scope(config_.device);
data_.reset(new FlatIndex(
resources_.get(),
dims,
flatConfig_.useFloat16,
flatConfig_.storeTransposed,
config_.memorySpace));
}
GpuIndexFlat::~GpuIndexFlat() {}
void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
DeviceScope scope(config_.device);
GpuIndex::copyFrom(index);
// GPU code has 32 bit indices
FAISS_THROW_IF_NOT_FMT(
index->ntotal <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices; "
"attempting to copy CPU index with %zu parameters",
(size_t)std::numeric_limits<int>::max(),
(size_t)index->ntotal);
data_.reset();
data_.reset(new FlatIndex(
resources_.get(),
this->d,
flatConfig_.useFloat16,
flatConfig_.storeTransposed,
config_.memorySpace));
// The index could be empty
if (index->ntotal > 0) {
data_->add(
index->get_xb(),
index->ntotal,
resources_->getDefaultStream(config_.device));
}
}
void GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
DeviceScope scope(config_.device);
GpuIndex::copyTo(index);
index->code_size = sizeof(float) * this->d;
FAISS_ASSERT(data_);
FAISS_ASSERT(data_->getSize() == this->ntotal);
index->codes.resize(this->ntotal * index->code_size);
auto stream = resources_->getDefaultStream(config_.device);
if (this->ntotal > 0) {
if (flatConfig_.useFloat16) {
auto vecFloat32 = data_->getVectorsFloat32Copy(stream);
fromDevice(vecFloat32, index->get_xb(), stream);
} else {
fromDevice(data_->getVectorsFloat32Ref(), index->get_xb(), stream);
}
}
}
size_t GpuIndexFlat::getNumVecs() const {
return this->ntotal;
}
void GpuIndexFlat::reset() {
DeviceScope scope(config_.device);
// Free the underlying memory
data_->reset();
this->ntotal = 0;
}
void GpuIndexFlat::train(Index::idx_t n, const float* x) {
// nothing to do
}
void GpuIndexFlat::add(Index::idx_t n, const float* x) {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
if (n == 0) {
// nothing to add
return;
}
DeviceScope scope(config_.device);
// To avoid multiple re-allocations, ensure we have enough storage
// available
data_->reserve(n, resources_->getDefaultStream(config_.device));
// If we're not operating in float16 mode, we don't need the input
// data to be resident on our device; we can add directly.
if (!flatConfig_.useFloat16) {
addImpl_(n, x, nullptr);
} else {
// Otherwise, perform the paging
GpuIndex::add(n, x);
}
}
bool GpuIndexFlat::addImplRequiresIDs_() const {
return false;
}
void GpuIndexFlat::addImpl_(int n, const float* x, const Index::idx_t* ids) {
FAISS_ASSERT(data_);
FAISS_ASSERT(n > 0);
// We do not support add_with_ids
FAISS_THROW_IF_NOT_MSG(!ids, "add_with_ids not supported");
// Due to GPU indexing in int32, we can't store more than this
// number of vectors on a GPU
FAISS_THROW_IF_NOT_FMT(
this->ntotal + n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
data_->add(x, n, resources_->getDefaultStream(config_.device));
this->ntotal += n;
}
void GpuIndexFlat::searchImpl_(
int n,
const float* x,
int k,
float* distances,
Index::idx_t* labels) const {
auto stream = resources_->getDefaultStream(config_.device);
// Input and output data are already resident on the GPU
Tensor<float, 2, true> queries(const_cast<float*>(x), {n, (int)this->d});
Tensor<float, 2, true> outDistances(distances, {n, k});
Tensor<Index::idx_t, 2, true> outLabels(labels, {n, k});
// FlatIndex only supports int indices
DeviceTensor<int, 2, true> outIntLabels(
resources_.get(), makeTempAlloc(AllocType::Other, stream), {n, k});
data_->query(
queries,
k,
metric_type,
metric_arg,
outDistances,
outIntLabels,
true);
// Convert int to idx_t
convertTensor<int, Index::idx_t, 2>(stream, outIntLabels, outLabels);
}
void GpuIndexFlat::reconstruct(Index::idx_t key, float* out) const {
DeviceScope scope(config_.device);
FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
auto stream = resources_->getDefaultStream(config_.device);
if (flatConfig_.useFloat16) {
// FIXME jhj: kernel for copy
auto vec = data_->getVectorsFloat32Copy(key, 1, stream);
fromDevice(vec.data(), out, this->d, stream);
} else {
auto vec = data_->getVectorsFloat32Ref()[key];
fromDevice(vec.data(), out, this->d, stream);
}
}
void GpuIndexFlat::reconstruct_n(Index::idx_t i0, Index::idx_t num, float* out)
const {
DeviceScope scope(config_.device);
FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds");
FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds");
auto stream = resources_->getDefaultStream(config_.device);
if (flatConfig_.useFloat16) {
// FIXME jhj: kernel for copy
auto vec = data_->getVectorsFloat32Copy(i0, num, stream);
fromDevice(vec.data(), out, num * this->d, stream);
} else {
auto vec = data_->getVectorsFloat32Ref()[i0];
fromDevice(vec.data(), out, this->d * num, stream);
}
}
void GpuIndexFlat::compute_residual(
const float* x,
float* residual,
Index::idx_t key) const {
compute_residual_n(1, x, residual, &key);
}
void GpuIndexFlat::compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const {
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
auto stream = resources_->getDefaultStream(config_.device);
DeviceScope scope(config_.device);
auto vecsDevice = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(xs),
stream,
{(int)n, (int)this->d});
auto idsDevice = toDeviceTemporary<Index::idx_t, 1>(
resources_.get(),
config_.device,
const_cast<Index::idx_t*>(keys),
stream,
{(int)n});
auto residualDevice = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
residuals,
stream,
{(int)n, (int)this->d});
// Convert idx_t to int
auto keysInt = convertTensorTemporary<Index::idx_t, int, 1>(
resources_.get(), stream, idsDevice);
FAISS_ASSERT(data_);
data_->computeResidual(vecsDevice, keysInt, residualDevice);
fromDevice<float, 2>(residualDevice, residuals, stream);
}
//
// GpuIndexFlatL2
//
GpuIndexFlatL2::GpuIndexFlatL2(
GpuResourcesProvider* provider,
faiss::IndexFlatL2* index,
GpuIndexFlatConfig config)
: GpuIndexFlat(provider, index, config) {}
GpuIndexFlatL2::GpuIndexFlatL2(
std::shared_ptr<GpuResources> resources,
faiss::IndexFlatL2* index,
GpuIndexFlatConfig config)
: GpuIndexFlat(resources, index, config) {}
GpuIndexFlatL2::GpuIndexFlatL2(
GpuResourcesProvider* provider,
int dims,
GpuIndexFlatConfig config)
: GpuIndexFlat(provider, dims, faiss::METRIC_L2, config) {}
GpuIndexFlatL2::GpuIndexFlatL2(
std::shared_ptr<GpuResources> resources,
int dims,
GpuIndexFlatConfig config)
: GpuIndexFlat(resources, dims, faiss::METRIC_L2, config) {}
void GpuIndexFlatL2::copyFrom(faiss::IndexFlat* index) {
FAISS_THROW_IF_NOT_MSG(
index->metric_type == metric_type,
"Cannot copy a GpuIndexFlatL2 from an index of "
"different metric_type");
GpuIndexFlat::copyFrom(index);
}
void GpuIndexFlatL2::copyTo(faiss::IndexFlat* index) {
FAISS_THROW_IF_NOT_MSG(
index->metric_type == metric_type,
"Cannot copy a GpuIndexFlatL2 to an index of "
"different metric_type");
GpuIndexFlat::copyTo(index);
}
//
// GpuIndexFlatIP
//
GpuIndexFlatIP::GpuIndexFlatIP(
GpuResourcesProvider* provider,
faiss::IndexFlatIP* index,
GpuIndexFlatConfig config)
: GpuIndexFlat(provider, index, config) {}
GpuIndexFlatIP::GpuIndexFlatIP(
std::shared_ptr<GpuResources> resources,
faiss::IndexFlatIP* index,
GpuIndexFlatConfig config)
: GpuIndexFlat(resources, index, config) {}
GpuIndexFlatIP::GpuIndexFlatIP(
GpuResourcesProvider* provider,
int dims,
GpuIndexFlatConfig config)
: GpuIndexFlat(provider, dims, faiss::METRIC_INNER_PRODUCT, config) {}
GpuIndexFlatIP::GpuIndexFlatIP(
std::shared_ptr<GpuResources> resources,
int dims,
GpuIndexFlatConfig config)
: GpuIndexFlat(resources, dims, faiss::METRIC_INNER_PRODUCT, config) {}
void GpuIndexFlatIP::copyFrom(faiss::IndexFlat* index) {
FAISS_THROW_IF_NOT_MSG(
index->metric_type == metric_type,
"Cannot copy a GpuIndexFlatIP from an index of "
"different metric_type");
GpuIndexFlat::copyFrom(index);
}
void GpuIndexFlatIP::copyTo(faiss::IndexFlat* index) {
// The passed in index must be IP
FAISS_THROW_IF_NOT_MSG(
index->metric_type == metric_type,
"Cannot copy a GpuIndexFlatIP to an index of "
"different metric_type");
GpuIndexFlat::copyTo(index);
}
} // namespace gpu
} // namespace faiss
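The copyFrom()/copyTo() implementations above define the CPU-to-GPU round trip for flat indices. Below is a hedged sketch of that round trip; it is not part of this diff, and StandardGpuResources is assumed from upstream Faiss.
// Hedged example: copy a CPU IndexFlatL2 onto the GPU (the constructor
// calls copyFrom()) and back (copyTo() converts float16 storage to float32).
#include <faiss/IndexFlat.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h> // assumed from upstream Faiss
#include <vector>
int main() {
    int d = 64;
    faiss::IndexFlatL2 cpuIndex(d);
    std::vector<float> xb(100 * static_cast<size_t>(d));
    for (size_t i = 0; i < xb.size(); ++i) {
        xb[i] = static_cast<float>(i % 113) / 113.0f;
    }
    cpuIndex.add(100, xb.data());
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatConfig config;
    config.useFloat16 = true; // store vectors on-device as float16
    // ntotal must fit in int32 (see the FAISS_THROW_IF_NOT_FMT in copyFrom)
    faiss::gpu::GpuIndexFlat gpuIndex(&res, &cpuIndex, config);
    faiss::IndexFlatL2 roundTrip(d);
    gpuIndex.copyTo(&roundTrip); // float16 data comes back as float32
    return 0;
}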
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/gpu/GpuIndex.h>
#include <memory>
namespace faiss {
struct IndexFlat;
struct IndexFlatL2;
struct IndexFlatIP;
} // namespace faiss
namespace faiss {
namespace gpu {
class FlatIndex;
struct GpuIndexFlatConfig : public GpuIndexConfig {
inline GpuIndexFlatConfig() : useFloat16(false), storeTransposed(false) {}
/// Whether or not data is stored as float16
bool useFloat16;
/// Whether or not data is stored (transparently) in a transposed
/// layout, enabling use of the NN GEMM call, which is ~10% faster.
/// This will improve the speed of the flat index, but will
/// substantially slow down any add() calls made, as all data must
/// be transposed, and will increase storage requirements (we store
/// data in both transposed and non-transposed layouts).
bool storeTransposed;
};
/// Wrapper around the GPU implementation that looks like
/// faiss::IndexFlat; copies over centroid data from a given
/// faiss::IndexFlat
class GpuIndexFlat : public GpuIndex {
public:
/// Construct from a pre-existing faiss::IndexFlat instance, copying
/// data over to the given GPU
GpuIndexFlat(
GpuResourcesProvider* provider,
const faiss::IndexFlat* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlat(
std::shared_ptr<GpuResources> resources,
const faiss::IndexFlat* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
/// Construct an empty instance that can be added to
GpuIndexFlat(
GpuResourcesProvider* provider,
int dims,
faiss::MetricType metric,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlat(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
~GpuIndexFlat() override;
/// Initialize ourselves from the given CPU index; will overwrite
/// all data in ourselves
void copyFrom(const faiss::IndexFlat* index);
/// Copy ourselves to the given CPU index; will overwrite all data
/// in the index instance
void copyTo(faiss::IndexFlat* index) const;
/// Returns the number of vectors we contain
size_t getNumVecs() const;
/// Clears all vectors from this index
void reset() override;
/// This index is not trained, so this does nothing
void train(Index::idx_t n, const float* x) override;
/// Overrides to avoid excessive copies
void add(Index::idx_t, const float* x) override;
/// Reconstruction methods; prefer the batch reconstruct as it will
/// be more efficient
void reconstruct(Index::idx_t key, float* out) const override;
/// Batch reconstruction method
void reconstruct_n(Index::idx_t i0, Index::idx_t num, float* out)
const override;
/// Compute residual
void compute_residual(const float* x, float* residual, Index::idx_t key)
const override;
/// Compute residual (batch mode)
void compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const override;
/// For internal access
inline FlatIndex* getGpuData() {
return data_.get();
}
protected:
/// Flat index does not require IDs as there is no storage available for
/// them
bool addImplRequiresIDs_() const override;
/// Called from GpuIndex for add
void addImpl_(int n, const float* x, const Index::idx_t* ids) override;
/// Called from GpuIndex for search
void searchImpl_(
int n,
const float* x,
int k,
float* distances,
Index::idx_t* labels) const override;
protected:
/// Our configuration options
const GpuIndexFlatConfig flatConfig_;
/// Holds our GPU data containing the list of vectors
std::unique_ptr<FlatIndex> data_;
};
/// Wrapper around the GPU implementation that looks like
/// faiss::IndexFlatL2; copies over centroid data from a given
/// faiss::IndexFlat
class GpuIndexFlatL2 : public GpuIndexFlat {
public:
/// Construct from a pre-existing faiss::IndexFlatL2 instance, copying
/// data over to the given GPU
GpuIndexFlatL2(
GpuResourcesProvider* provider,
faiss::IndexFlatL2* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlatL2(
std::shared_ptr<GpuResources> resources,
faiss::IndexFlatL2* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
/// Construct an empty instance that can be added to
GpuIndexFlatL2(
GpuResourcesProvider* provider,
int dims,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlatL2(
std::shared_ptr<GpuResources> resources,
int dims,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
/// Initialize ourselves from the given CPU index; will overwrite
/// all data in ourselves
void copyFrom(faiss::IndexFlat* index);
/// Copy ourselves to the given CPU index; will overwrite all data
/// in the index instance
void copyTo(faiss::IndexFlat* index);
};
/// Wrapper around the GPU implementation that looks like
/// faiss::IndexFlatIP; copies over centroid data from a given
/// faiss::IndexFlat
class GpuIndexFlatIP : public GpuIndexFlat {
public:
/// Construct from a pre-existing faiss::IndexFlatIP instance, copying
/// data over to the given GPU
GpuIndexFlatIP(
GpuResourcesProvider* provider,
faiss::IndexFlatIP* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlatIP(
std::shared_ptr<GpuResources> resources,
faiss::IndexFlatIP* index,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
/// Construct an empty instance that can be added to
GpuIndexFlatIP(
GpuResourcesProvider* provider,
int dims,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
GpuIndexFlatIP(
std::shared_ptr<GpuResources> resources,
int dims,
GpuIndexFlatConfig config = GpuIndexFlatConfig());
/// Initialize ourselves from the given CPU index; will overwrite
/// all data in ourselves
void copyFrom(faiss::IndexFlat* index);
/// Copy ourselves to the given CPU index; will overwrite all data
/// in the index instance
void copyTo(faiss::IndexFlat* index);
};
} // namespace gpu
} // namespace faiss
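A minimal sketch of the empty-construction path declared in this header: create a GpuIndexFlatL2 directly on the GPU, add vectors, and search. StandardGpuResources is again assumed from upstream Faiss; storeTransposed trades slower add() for faster queries, as the config comment above notes.
// Hedged example: empty construction, add, and k-NN search on the GPU.
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h> // assumed from upstream Faiss
#include <vector>
int main() {
    int d = 128;
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatConfig config;
    config.storeTransposed = true; // faster queries, slower add()
    faiss::gpu::GpuIndexFlatL2 index(&res, d, config);
    std::vector<float> xb(10000 * static_cast<size_t>(d));
    for (size_t i = 0; i < xb.size(); ++i) {
        xb[i] = static_cast<float>(i % 251) / 251.0f;
    }
    index.add(10000, xb.data());
    int k = 5;
    std::vector<float> distances(k);
    std::vector<faiss::Index::idx_t> labels(k);
    index.search(1, xb.data(), k, distances.data(), labels.data());
    return 0;
}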
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVF.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/GpuIndexIVF.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/Float16.cuh>
namespace faiss {
namespace gpu {
GpuIndexIVF::GpuIndexIVF(
GpuResourcesProvider* provider,
int dims,
faiss::MetricType metric,
float metricArg,
int nlistIn,
GpuIndexIVFConfig config)
: GpuIndex(provider->getResources(), dims, metric, metricArg, config),
nlist(nlistIn),
nprobe(1),
quantizer(nullptr),
ivfConfig_(config) {
init_();
// Only IP and L2 are supported for now
if (!(metric_type == faiss::METRIC_L2 ||
metric_type == faiss::METRIC_INNER_PRODUCT)) {
FAISS_THROW_FMT("unsupported metric type %d", (int)metric_type);
}
}
void GpuIndexIVF::init_() {
FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be > 0");
// Spherical by default if the metric is inner_product
if (metric_type == faiss::METRIC_INNER_PRODUCT) {
cp.spherical = true;
}
// here we set a low # iterations because this is typically used
// for large clusterings
cp.niter = 10;
cp.verbose = verbose;
if (!quantizer) {
// Construct an empty quantizer
GpuIndexFlatConfig config = ivfConfig_.flatConfig;
// FIXME: inherit our same device
config.device = config_.device;
if (metric_type == faiss::METRIC_L2) {
quantizer = new GpuIndexFlatL2(resources_, d, config);
} else if (metric_type == faiss::METRIC_INNER_PRODUCT) {
quantizer = new GpuIndexFlatIP(resources_, d, config);
} else {
// unknown metric type
FAISS_THROW_FMT("unsupported metric type %d", (int)metric_type);
}
}
}
GpuIndexIVF::~GpuIndexIVF() {
delete quantizer;
}
GpuIndexFlat* GpuIndexIVF::getQuantizer() {
return quantizer;
}
void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) {
DeviceScope scope(config_.device);
GpuIndex::copyFrom(index);
FAISS_ASSERT(index->nlist > 0);
FAISS_THROW_IF_NOT_FMT(
index->nlist <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports %zu inverted lists",
(size_t)std::numeric_limits<int>::max());
nlist = index->nlist;
FAISS_THROW_IF_NOT_FMT(
index->nprobe > 0 && index->nprobe <= getMaxKSelection(),
"GPU index only supports nprobe <= %zu; passed %zu",
(size_t)getMaxKSelection(),
index->nprobe);
nprobe = index->nprobe;
// The metric type may have changed as well, so we might have to
// change our quantizer
delete quantizer;
quantizer = nullptr;
// Construct an empty quantizer
GpuIndexFlatConfig config = ivfConfig_.flatConfig;
// FIXME: inherit our same device
config.device = config_.device;
if (index->metric_type == faiss::METRIC_L2) {
// FIXME: 2 different float16 options?
quantizer = new GpuIndexFlatL2(resources_, this->d, config);
} else if (index->metric_type == faiss::METRIC_INNER_PRODUCT) {
// FIXME: 2 different float16 options?
quantizer = new GpuIndexFlatIP(resources_, this->d, config);
} else {
// unknown metric type
FAISS_ASSERT(false);
}
if (!index->is_trained) {
// copied in GpuIndex::copyFrom
FAISS_ASSERT(!is_trained && ntotal == 0);
return;
}
// copied in GpuIndex::copyFrom
// ntotal can exceed max int, but the number of vectors per inverted
// list cannot exceed this. We check this in the subclasses.
FAISS_ASSERT(is_trained && (ntotal == index->ntotal));
// Since we're trained, the quantizer must have data
FAISS_ASSERT(index->quantizer->ntotal > 0);
// Right now, we can only handle IndexFlat or derived classes
auto qFlat = dynamic_cast<faiss::IndexFlat*>(index->quantizer);
FAISS_THROW_IF_NOT_MSG(
qFlat,
"Only IndexFlat is supported for the coarse quantizer "
"for copying from an IndexIVF into a GpuIndexIVF");
quantizer->copyFrom(qFlat);
}
void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const {
DeviceScope scope(config_.device);
//
// Index information
//
GpuIndex::copyTo(index);
//
// IndexIVF information
//
index->nlist = nlist;
index->nprobe = nprobe;
// Construct and copy the appropriate quantizer
faiss::IndexFlat* q = nullptr;
if (this->metric_type == faiss::METRIC_L2) {
q = new faiss::IndexFlatL2(this->d);
} else if (this->metric_type == faiss::METRIC_INNER_PRODUCT) {
q = new faiss::IndexFlatIP(this->d);
} else {
// we should have one of the above metrics
FAISS_ASSERT(false);
}
FAISS_ASSERT(quantizer);
quantizer->copyTo(q);
if (index->own_fields) {
delete index->quantizer;
}
index->quantizer = q;
index->quantizer_trains_alone = 0;
index->own_fields = true;
index->cp = this->cp;
index->make_direct_map(false);
}
int GpuIndexIVF::getNumLists() const {
return nlist;
}
void GpuIndexIVF::setNumProbes(int nprobe) {
FAISS_THROW_IF_NOT_FMT(
nprobe > 0 && nprobe <= getMaxKSelection(),
"GPU index only supports nprobe <= %d; passed %d",
getMaxKSelection(),
nprobe);
this->nprobe = nprobe;
}
int GpuIndexIVF::getNumProbes() const {
return nprobe;
}
bool GpuIndexIVF::addImplRequiresIDs_() const {
// All IVF indices have storage for IDs
return true;
}
void GpuIndexIVF::trainQuantizer_(Index::idx_t n, const float* x) {
if (n == 0) {
// nothing to do
return;
}
if (quantizer->is_trained && (quantizer->ntotal == nlist)) {
if (this->verbose) {
printf("IVF quantizer does not need training.\n");
}
return;
}
if (this->verbose) {
printf("Training IVF quantizer on %ld vectors in %dD\n", n, d);
}
DeviceScope scope(config_.device);
// leverage the CPU-side k-means code, which works for the GPU
// flat index as well
quantizer->reset();
Clustering clus(this->d, nlist, this->cp);
clus.verbose = verbose;
clus.train(n, x, *quantizer);
quantizer->is_trained = true;
FAISS_ASSERT(quantizer->ntotal == nlist);
}
} // namespace gpu
} // namespace faiss
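GpuIndexIVF itself is abstract (getListLength() and friends are pure virtual), so the copyFrom() and setNumProbes() paths above are normally exercised through a concrete subclass. A hedged sketch using GpuIndexIVFFlat, which exists in upstream Faiss but is not part of this diff:
// Hedged example: copy a trained CPU IndexIVFFlat to the GPU and tune
// nprobe. GpuIndexIVFFlat and StandardGpuResources are assumed from
// upstream Faiss; neither is defined in this diff.
#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>
int main() {
    int d = 64, nlist = 100;
    faiss::IndexFlatL2 coarse(d);
    faiss::IndexIVFFlat cpuIndex(&coarse, d, nlist, faiss::METRIC_L2);
    std::vector<float> xt(5000 * static_cast<size_t>(d));
    for (size_t i = 0; i < xt.size(); ++i) {
        xt[i] = static_cast<float>(i % 199) / 199.0f;
    }
    cpuIndex.train(5000, xt.data());
    cpuIndex.add(5000, xt.data());
    faiss::gpu::StandardGpuResources res;
    // The constructor copies the CPU index, including its flat quantizer
    faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, &cpuIndex);
    gpuIndex.setNumProbes(8); // must be <= getMaxKSelection()
    return 0;
}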
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Clustering.h>
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/GpuIndicesOptions.h>
namespace faiss {
struct IndexIVF;
}
namespace faiss {
namespace gpu {
class GpuIndexFlat;
struct GpuIndexIVFConfig : public GpuIndexConfig {
inline GpuIndexIVFConfig() : indicesOptions(INDICES_64_BIT) {}
/// Index storage options for the GPU
IndicesOptions indicesOptions;
/// Configuration for the coarse quantizer object
GpuIndexFlatConfig flatConfig;
};
class GpuIndexIVF : public GpuIndex {
public:
GpuIndexIVF(
GpuResourcesProvider* provider,
int dims,
faiss::MetricType metric,
float metricArg,
int nlist,
GpuIndexIVFConfig config = GpuIndexIVFConfig());
~GpuIndexIVF() override;
private:
/// Shared initialization functions
void init_();
public:
/// Copy what we need from the CPU equivalent
void copyFrom(const faiss::IndexIVF* index);
/// Copy what we have to the CPU equivalent
void copyTo(faiss::IndexIVF* index) const;
/// Returns the number of inverted lists we're managing
int getNumLists() const;
/// Returns the number of vectors present in a particular inverted list
virtual int getListLength(int listId) const = 0;
/// Return the encoded vector data contained in a particular inverted list,
/// for debugging purposes.
/// If gpuFormat is true, the data is returned as it is encoded in the
/// GPU-side representation.
/// Otherwise, it is converted to the CPU-side Faiss-compliant format,
/// while the native GPU format may differ.
virtual std::vector<uint8_t> getListVectorData(
int listId,
bool gpuFormat = false) const = 0;
/// Return the vector indices contained in a particular inverted list, for
/// debugging purposes.
virtual std::vector<Index::idx_t> getListIndices(int listId) const = 0;
/// Return the quantizer we're using
GpuIndexFlat* getQuantizer();
/// Sets the number of list probes per query
void setNumProbes(int nprobe);
/// Returns our current number of list probes per query
int getNumProbes() const;
protected:
bool addImplRequiresIDs_() const override;
void trainQuantizer_(Index::idx_t n, const float* x);
public:
/// Exposing this like the CPU version for manipulation
ClusteringParameters cp;
/// Exposing this like the CPU version for query
int nlist;
/// Exposing this like the CPU version for manipulation
int nprobe;
/// Exposing this like the CPU version for query
GpuIndexFlat* quantizer;
protected:
/// Our configuration options
const GpuIndexIVFConfig ivfConfig_;
};
} // namespace gpu
} // namespace faiss
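Finally, a short sketch of the debugging accessors declared above (getListLength(), getListIndices(), getListVectorData()), again through the assumed upstream GpuIndexIVFFlat subclass; this is illustrative only and not part of the commit.
// Hedged example: inspect an inverted list after training and adding.
// GpuIndexIVFFlat and StandardGpuResources are assumed from upstream Faiss.
#include <faiss/gpu/GpuIndexIVFFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>
int main() {
    int d = 32, nlist = 16;
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexIVFFlat index(&res, d, nlist, faiss::METRIC_L2);
    std::vector<float> xb(2000 * static_cast<size_t>(d));
    for (size_t i = 0; i < xb.size(); ++i) {
        xb[i] = static_cast<float>(i % 97) / 97.0f;
    }
    index.train(2000, xb.data()); // clusters the coarse quantizer
    index.add(2000, xb.data());
    int len = index.getListLength(0);               // vectors in list 0
    std::vector<faiss::Index::idx_t> ids = index.getListIndices(0);
    std::vector<uint8_t> codes =
            index.getListVectorData(0, /*gpuFormat=*/false);
    (void)len; (void)ids; (void)codes;
    return 0;
}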