Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuClonerOptions.h>
namespace faiss {
namespace gpu {
GpuClonerOptions::GpuClonerOptions()
: indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(false),
reserveVecs(0),
storeTransposed(false),
verbose(false) {}
GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false), shard_type(1) {}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/gpu/GpuIndicesOptions.h>
namespace faiss {
namespace gpu {
/// set some options on how to copy to GPU
struct GpuClonerOptions {
GpuClonerOptions();
/// how should indices be stored on index types that support indices
/// (anything but GpuIndexFlat*)?
IndicesOptions indicesOptions;
/// is the coarse quantizer in float16?
bool useFloat16CoarseQuantizer;
/// for GpuIndexIVFFlat, is storage in float16?
/// for GpuIndexIVFPQ, are intermediate calculations in float16?
bool useFloat16;
/// use precomputed tables?
bool usePrecomputed;
/// reserve vectors in the invfiles?
long reserveVecs;
/// For GpuIndexFlat, store data in transposed layout?
bool storeTransposed;
/// Set verbose options on the index
bool verbose;
};
struct GpuMultipleClonerOptions : public GpuClonerOptions {
GpuMultipleClonerOptions();
/// Whether to shard the index across GPUs, versus replication
/// across GPUs
bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
};
} // namespace gpu
} // namespace faiss
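
A quick usage sketch for context (not part of the commit): these options are consumed by the index-cloning helpers. The snippet below is only an illustration and assumes faiss/gpu/GpuCloner.h with index_cpu_to_gpu(), faiss/IndexFlat.h, and StandardGpuResources are available in this build.

#include <faiss/IndexFlat.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuClonerOptions.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <memory>

int main() {
    faiss::gpu::StandardGpuResources res;   // GPU memory / stream provider
    faiss::IndexFlatL2 cpu_index(64);       // small CPU index, d = 64

    faiss::gpu::GpuClonerOptions opts;
    opts.indicesOptions = faiss::gpu::INDICES_64_BIT;
    opts.useFloat16 = true;                 // trade accuracy for memory
    opts.verbose = true;

    // Clone the CPU index to GPU 0 with the options above
    std::unique_ptr<faiss::Index> gpu_index(
            faiss::gpu::index_cpu_to_gpu(&res, /*device=*/0, &cpu_index, &opts));
    return 0;
}
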
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <iostream>
namespace faiss {
namespace gpu {
template <typename T>
void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
// Validate the input data
FAISS_THROW_IF_NOT_MSG(
args.k > 0 || args.k == -1,
"bfKnn: k must be > 0 for top-k reduction, "
"or -1 for all pairwise distances");
FAISS_THROW_IF_NOT_MSG(args.dims > 0, "bfKnn: dims must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.numVectors > 0, "bfKnn: numVectors must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.vectors, "bfKnn: vectors must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.numQueries > 0, "bfKnn: numQueries must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.queries, "bfKnn: queries must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.outDistances,
"bfKnn: outDistances must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.outIndices || args.k == -1,
"bfKnn: outIndices must be provided (passed null)");
// Don't let the resources go out of scope
auto resImpl = prov->getResources();
auto res = resImpl.get();
auto device = getCurrentDevice();
auto stream = res->getDefaultStreamCurrentDevice();
auto tVectors = toDeviceTemporary<T, 2>(
res,
device,
const_cast<T*>(reinterpret_cast<const T*>(args.vectors)),
stream,
{args.vectorsRowMajor ? args.numVectors : args.dims,
args.vectorsRowMajor ? args.dims : args.numVectors});
auto tQueries = toDeviceTemporary<T, 2>(
res,
device,
const_cast<T*>(reinterpret_cast<const T*>(args.queries)),
stream,
{args.queriesRowMajor ? args.numQueries : args.dims,
args.queriesRowMajor ? args.dims : args.numQueries});
DeviceTensor<float, 1, true> tVectorNorms;
if (args.vectorNorms) {
tVectorNorms = toDeviceTemporary<float, 1>(
res,
device,
const_cast<float*>(args.vectorNorms),
stream,
{args.numVectors});
}
auto tOutDistances = toDeviceTemporary<float, 2>(
res,
device,
args.outDistances,
stream,
{args.numQueries, args.k == -1 ? args.numVectors : args.k});
if (args.k == -1) {
// Reporting all pairwise distances
allPairwiseDistanceOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.metric,
args.metricArg,
tOutDistances);
} else if (args.outIndicesType == IndicesDataType::I64) {
// The brute-force API only supports an interface for i32 indices,
// so we must create an output i32 buffer then convert back
DeviceTensor<int, 2, true> tOutIntIndices(
res,
makeTempAlloc(AllocType::Other, stream),
{args.numQueries, args.k});
// Since we've guaranteed that all arguments are on device, call the
// implementation
bfKnnOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.k,
args.metric,
args.metricArg,
tOutDistances,
tOutIntIndices,
args.ignoreOutDistances);
// Convert and copy int indices out
auto tOutIndices = toDeviceTemporary<Index::idx_t, 2>(
res,
device,
(Index::idx_t*)args.outIndices,
stream,
{args.numQueries, args.k});
// Convert int to idx_t
convertTensor<int, Index::idx_t, 2>(
stream, tOutIntIndices, tOutIndices);
// Copy back if necessary
fromDevice<Index::idx_t, 2>(
tOutIndices, (Index::idx_t*)args.outIndices, stream);
} else if (args.outIndicesType == IndicesDataType::I32) {
// We can use the brute-force API directly, as it takes i32 indices
// FIXME: convert to int32_t everywhere?
static_assert(sizeof(int) == 4, "");
auto tOutIntIndices = toDeviceTemporary<int, 2>(
res,
device,
(int*)args.outIndices,
stream,
{args.numQueries, args.k});
// Since we've guaranteed that all arguments are on device, call the
// implementation
bfKnnOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.k,
args.metric,
args.metricArg,
tOutDistances,
tOutIntIndices,
args.ignoreOutDistances);
// Copy back if necessary
fromDevice<int, 2>(tOutIntIndices, (int*)args.outIndices, stream);
} else {
FAISS_THROW_MSG("unknown outIndicesType");
}
// Copy distances back if necessary
fromDevice<float, 2>(tOutDistances, args.outDistances, stream);
}
void bfKnn(GpuResourcesProvider* res, const GpuDistanceParams& args) {
// For now, both vectors and queries must be of the same data type
FAISS_THROW_IF_NOT_MSG(
args.vectorType == args.queryType,
"limitation: both vectorType and queryType must currently "
"be the same (F32 or F16");
if (args.vectorType == DistanceDataType::F32) {
bfKnnConvert<float>(res, args);
} else if (args.vectorType == DistanceDataType::F16) {
bfKnnConvert<half>(res, args);
} else {
FAISS_THROW_MSG("unknown vectorType");
}
}
// legacy version
void bruteForceKnn(
GpuResourcesProvider* res,
faiss::MetricType metric,
// A region of memory size numVectors x dims, with dims
// innermost
const float* vectors,
bool vectorsRowMajor,
int numVectors,
// A region of memory size numQueries x dims, with dims
// innermost
const float* queries,
bool queriesRowMajor,
int numQueries,
int dims,
int k,
// A region of memory size numQueries x k, with k
// innermost
float* outDistances,
// A region of memory size numQueries x k, with k
// innermost
Index::idx_t* outIndices) {
std::cerr << "bruteForceKnn is deprecated; call bfKnn instead" << std::endl;
GpuDistanceParams args;
args.metric = metric;
args.k = k;
args.dims = dims;
args.vectors = vectors;
args.vectorsRowMajor = vectorsRowMajor;
args.numVectors = numVectors;
args.queries = queries;
args.queriesRowMajor = queriesRowMajor;
args.numQueries = numQueries;
args.outDistances = outDistances;
args.outIndices = outIndices;
bfKnn(res, args);
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
namespace faiss {
namespace gpu {
class GpuResourcesProvider;
// Scalar type of the vector data
enum class DistanceDataType {
F32 = 1,
F16,
};
// Scalar type of the indices data
enum class IndicesDataType {
I64 = 1,
I32,
};
/// Arguments to brute-force GPU k-nearest neighbor searching
struct GpuDistanceParams {
GpuDistanceParams()
: metric(faiss::MetricType::METRIC_L2),
metricArg(0),
k(0),
dims(0),
vectors(nullptr),
vectorType(DistanceDataType::F32),
vectorsRowMajor(true),
numVectors(0),
vectorNorms(nullptr),
queries(nullptr),
queryType(DistanceDataType::F32),
queriesRowMajor(true),
numQueries(0),
outDistances(nullptr),
ignoreOutDistances(false),
outIndicesType(IndicesDataType::I64),
outIndices(nullptr) {}
//
// Search parameters
//
/// Search parameter: distance metric
faiss::MetricType metric;
/// Search parameter: distance metric argument (if applicable)
/// For metric == METRIC_Lp, this is the p-value
float metricArg;
/// Search parameter: return k nearest neighbors
/// If the value provided is -1, then we report all pairwise distances
/// without top-k filtering
int k;
/// Vector dimensionality
int dims;
//
// Vectors being queried
//
/// If vectorsRowMajor is true, this is
/// numVectors x dims, with dims innermost; otherwise,
/// dims x numVectors, with numVectors innermost
const void* vectors;
DistanceDataType vectorType;
bool vectorsRowMajor;
int numVectors;
/// Precomputed L2 norms for each vector in `vectors`, which can be
/// optionally provided in advance to speed computation for METRIC_L2
const float* vectorNorms;
//
// The query vectors (i.e., find k-nearest neighbors in `vectors` for each
// of the `queries`)
//
/// If queriesRowMajor is true, this is
/// numQueries x dims, with dims innermost; otherwise,
/// dims x numQueries, with numQueries innermost
const void* queries;
DistanceDataType queryType;
bool queriesRowMajor;
int numQueries;
//
// Output results
//
/// A region of memory size numQueries x k, with k
/// innermost (row major) if k > 0, or if k == -1, a region of memory of
/// size numQueries x numVectors
float* outDistances;
/// Do we only care about the indices reported, rather than the output
/// distances? Not used if k == -1 (all pairwise distances)
bool ignoreOutDistances;
/// A region of memory size numQueries x k, with k
/// innermost (row major). Not used if k == -1 (all pairwise distances)
IndicesDataType outIndicesType;
void* outIndices;
};
/// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest
/// neighbor searches on an externally-provided region of memory (e.g., from a
/// pytorch tensor).
/// The data (vectors, queries, outDistances, outIndices) can be resident on the
/// GPU or the CPU, but all calculations are performed on the GPU. If the result
/// buffers are on the CPU, results will be copied back when done.
///
/// All GPU computation is performed on the current CUDA device, and ordered
/// with respect to resources->getDefaultStreamCurrentDevice().
///
/// For each vector in `queries`, searches all of `vectors` to find its k
/// nearest neighbors with respect to the given metric
void bfKnn(GpuResourcesProvider* resources, const GpuDistanceParams& args);
/// Deprecated legacy implementation
void bruteForceKnn(
GpuResourcesProvider* resources,
faiss::MetricType metric,
// If vectorsRowMajor is true, this is
// numVectors x dims, with dims innermost; otherwise,
// dims x numVectors, with numVectors innermost
const float* vectors,
bool vectorsRowMajor,
int numVectors,
// If queriesRowMajor is true, this is
// numQueries x dims, with dims innermost; otherwise,
// dims x numQueries, with numQueries innermost
const float* queries,
bool queriesRowMajor,
int numQueries,
int dims,
int k,
// A region of memory size numQueries x k, with k
// innermost (row major)
float* outDistances,
// A region of memory size numQueries x k, with k
// innermost (row major)
Index::idx_t* outIndices);
} // namespace gpu
} // namespace faiss
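
For reference, a minimal host-side sketch of driving bfKnn() with CPU-resident, row-major float32 buffers; the wrapper copies results back to the CPU output buffers. StandardGpuResources (a GpuResourcesProvider) is assumed to be available; the remaining names come from this header.

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

void knnExample(const float* xb, int nb, const float* xq, int nq, int d, int k) {
    faiss::gpu::StandardGpuResources res;

    std::vector<float> distances((size_t)nq * k);
    std::vector<faiss::Index::idx_t> indices((size_t)nq * k);

    faiss::gpu::GpuDistanceParams args;     // defaults: F32 data, row major, I64 indices
    args.metric = faiss::MetricType::METRIC_L2;
    args.k = k;
    args.dims = d;
    args.vectors = xb;                      // nb x d
    args.numVectors = nb;
    args.queries = xq;                      // nq x d
    args.numQueries = nq;
    args.outDistances = distances.data();
    args.outIndices = indices.data();

    faiss::gpu::bfKnn(&res, args);          // runs on the current device's default stream
}
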
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef GPU_FAISS_ASSERT_INCLUDED
#define GPU_FAISS_ASSERT_INCLUDED
#include <hip/hip_runtime.h>
#include <faiss/impl/FaissAssert.h>
///
/// Assertions
///
#ifdef __CUDA_ARCH__
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
#else
#define GPU_FAISS_ASSERT(X) FAISS_ASSERT(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) FAISS_ASSERT_MSG(X, MSG)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) FAISS_ASSERT_FMT(X, FMT, __VA_ARGS__)
#endif // __CUDA_ARCH__
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef GPU_FAISS_ASSERT_INCLUDED
#define GPU_FAISS_ASSERT_INCLUDED
#include <cuda.h>
#include <faiss/impl/FaissAssert.h>
///
/// Assertions
///
#ifdef __CUDA_ARCH__
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
#else
#define GPU_FAISS_ASSERT(X) FAISS_ASSERT(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) FAISS_ASSERT_MSG(X, MSG)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) FAISS_ASSERT_FMT(X, FMT, __VA_ARGS__)
#endif // __CUDA_ARCH__
#endif
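
A small illustration of how these macros are meant to be used; the kernel is hypothetical and the include path is assumed to match the header above. In device code (where __CUDA_ARCH__ is defined) the message and format arguments are dropped and only assert(X) remains; on the host the full FAISS_ASSERT* diagnostics are kept.

#include <faiss/gpu/GpuFaissAssert.h>  // path assumed
#include <cassert>

__global__ void checkedCopy(const float* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    GPU_FAISS_ASSERT(in != out);       // device build: plain assert(in != out)
    if (i < n) {
        out[i] = in[i];
    }
}

void hostCheck(int numVecs) {
    // host build: formatted message is reported via FAISS_ASSERT_FMT
    GPU_FAISS_ASSERT_FMT(numVecs >= 0, "invalid numVecs %d", numVecs);
}
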
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIcmEncoder.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/utils/WorkerThread.h>
#include <faiss/gpu/impl/IcmEncoder.cuh>
#include <algorithm>
namespace faiss {
namespace gpu {
///< A helper structure to support multi-GPU
struct IcmEncoderShards {
std::vector<std::pair<
std::unique_ptr<IcmEncoderImpl>,
std::unique_ptr<WorkerThread>>>
workers;
void add(IcmEncoderImpl* encoder) {
workers.emplace_back(std::make_pair(
std::unique_ptr<IcmEncoderImpl>(encoder),
std::unique_ptr<WorkerThread>(new WorkerThread)));
}
IcmEncoderImpl* at(int idx) {
return workers[idx].first.get();
}
///< call f(idx, encoder) for each encoder
void runOnShards(std::function<void(int, IcmEncoderImpl*)> f) {
std::vector<std::future<bool>> v;
for (int i = 0; i < this->workers.size(); ++i) {
auto& p = this->workers[i];
auto encoder = p.first.get();
v.emplace_back(p.second->add([f, i, encoder]() { f(i, encoder); }));
}
for (int i = 0; i < v.size(); ++i) {
auto& fut = v[i];
fut.get(); // no exception handling; a failure in any worker thread will crash
}
}
size_t size() {
return workers.size();
}
};
GpuIcmEncoder::GpuIcmEncoder(
const LocalSearchQuantizer* lsq,
const std::vector<GpuResourcesProvider*>& provs,
const std::vector<int>& devices)
: lsq::IcmEncoder(lsq), shards(new IcmEncoderShards()) {
// create an IcmEncoderImpl instance for each device.
for (size_t i = 0; i < provs.size(); i++) {
shards->add(new IcmEncoderImpl(
lsq->M, lsq->K, lsq->d, provs[i], devices[i]));
}
}
GpuIcmEncoder::~GpuIcmEncoder() {}
void GpuIcmEncoder::set_binary_term() {
auto fn = [=](int idx, IcmEncoderImpl* encoder) {
encoder->setBinaryTerm(lsq->codebooks.data());
};
shards->runOnShards(fn);
}
void GpuIcmEncoder::encode(
int32_t* codes,
const float* x,
std::mt19937& gen,
size_t n,
size_t ils_iters) const {
size_t nshards = shards->size();
size_t shard_size = (n + nshards - 1) / nshards;
auto codebooks = lsq->codebooks.data();
auto M = lsq->M;
auto d = lsq->d;
auto nperts = lsq->nperts;
auto icm_iters = lsq->icm_iters;
auto seed = gen();
// split input data
auto fn = [=](int idx, IcmEncoderImpl* encoder) {
size_t i0 = idx * shard_size;
size_t ni = std::min(shard_size, n - i0);
auto xi = x + i0 * d;
auto ci = codes + i0 * M;
std::mt19937 geni(idx + seed); // different seed for each shard
encoder->encode(
ci, xi, codebooks, geni, ni, nperts, ils_iters, icm_iters);
};
shards->runOnShards(fn);
}
GpuIcmEncoderFactory::GpuIcmEncoderFactory(int ngpus) {
for (int i = 0; i < ngpus; i++) {
provs.push_back(new StandardGpuResources());
devices.push_back(i);
}
}
lsq::IcmEncoder* GpuIcmEncoderFactory::get(const LocalSearchQuantizer* lsq) {
return new GpuIcmEncoder(lsq, provs, devices);
}
} // namespace gpu
} // namespace faiss
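
The split in encode() above is plain ceil division over the shards: each shard gets shard_size = ceil(n / nshards) vectors and the last shard takes whatever remains. A tiny standalone illustration of the same arithmetic:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    size_t n = 10, nshards = 3;
    size_t shard_size = (n + nshards - 1) / nshards;   // ceil(10 / 3) = 4
    for (size_t idx = 0; idx < nshards; ++idx) {
        size_t i0 = idx * shard_size;                  // 0, 4, 8
        size_t ni = std::min(shard_size, n - i0);      // 4, 4, 2
        std::printf("shard %zu: vectors [%zu, %zu)\n", idx, i0, i0 + ni);
    }
    return 0;
}
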
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/impl/LocalSearchQuantizer.h>
#include <memory>
namespace faiss {
namespace gpu {
class GpuResourcesProvider;
struct IcmEncoderShards;
/** Perform LSQ encoding on GPU.
*
* Split input vectors to different devices and call IcmEncoderImpl::encode
* to encode them
*/
class GpuIcmEncoder : public lsq::IcmEncoder {
public:
GpuIcmEncoder(
const LocalSearchQuantizer* lsq,
const std::vector<GpuResourcesProvider*>& provs,
const std::vector<int>& devices);
~GpuIcmEncoder();
GpuIcmEncoder(const GpuIcmEncoder&) = delete;
GpuIcmEncoder& operator=(const GpuIcmEncoder&) = delete;
void set_binary_term() override;
void encode(
int32_t* codes,
const float* x,
std::mt19937& gen,
size_t n,
size_t ils_iters) const override;
private:
std::unique_ptr<IcmEncoderShards> shards;
};
struct GpuIcmEncoderFactory : public lsq::IcmEncoderFactory {
explicit GpuIcmEncoderFactory(int ngpus = 1);
lsq::IcmEncoder* get(const LocalSearchQuantizer* lsq) override;
std::vector<GpuResourcesProvider*> provs;
std::vector<int> devices;
};
} // namespace gpu
} // namespace faiss
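
A short sketch of how this factory plugs into LSQ training. LocalSearchQuantizer comes from the header included above; the icm_encoder_factory member, train(), compute_codes(), and code_size are assumed to behave as in mainline faiss.

#include <faiss/gpu/GpuIcmEncoder.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <cstdint>
#include <vector>

void trainLsqOnGpu(const float* xt, size_t nt, size_t d) {
    // 8 sub-codebooks of 8 bits each; values chosen only for illustration
    faiss::LocalSearchQuantizer lsq(d, /*M=*/8, /*nbits=*/8);

    // Route the ICM encoding step to the GPU implementation (1 GPU here)
    lsq.icm_encoder_factory = new faiss::gpu::GpuIcmEncoderFactory(/*ngpus=*/1);

    lsq.train(nt, xt);

    std::vector<uint8_t> codes(nt * lsq.code_size);
    lsq.compute_codes(xt, codes.data(), nt);
}
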
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <algorithm>
#include <limits>
#include <memory>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t)256 * 1024 * 1024;
// Default size for which we page add or search
constexpr size_t kAddPageSize = (size_t)256 * 1024 * 1024;
// Or, maximum number of vectors to consider per page of add or search
constexpr size_t kAddVecSize = (size_t)512 * 1024;
// Use a smaller search size, as precomputed code usage on IVFPQ
// requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = (size_t)32 * 1024;
GpuIndex::GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config)
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
config_.device);
FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
FAISS_THROW_IF_NOT_FMT(
config_.memorySpace == MemorySpace::Device ||
(config_.memorySpace == MemorySpace::Unified &&
getFullUnifiedMemSupport(config_.device)),
"Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
config.device);
metric_arg = metricArg;
FAISS_ASSERT((bool)resources_);
resources_->initializeForDevice(config_.device);
}
int GpuIndex::getDevice() const {
return config_.device;
}
void GpuIndex::copyFrom(const faiss::Index* index) {
d = index->d;
metric_type = index->metric_type;
metric_arg = index->metric_arg;
ntotal = index->ntotal;
is_trained = index->is_trained;
}
void GpuIndex::copyTo(faiss::Index* index) const {
index->d = d;
index->metric_type = metric_type;
index->metric_arg = metric_arg;
index->ntotal = ntotal;
index->is_trained = is_trained;
}
void GpuIndex::setMinPagingSize(size_t size) {
minPagedSize_ = size;
}
size_t GpuIndex::getMinPagingSize() const {
return minPagedSize_;
}
void GpuIndex::add(Index::idx_t n, const float* x) {
// Pass to add_with_ids
add_with_ids(n, x, nullptr);
}
void GpuIndex::add_with_ids(
Index::idx_t n,
const float* x,
const Index::idx_t* ids) {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
if (n == 0) {
// nothing to add
return;
}
std::vector<Index::idx_t> generatedIds;
// Generate IDs if we need them
if (!ids && addImplRequiresIDs_()) {
generatedIds = std::vector<Index::idx_t>(n);
for (Index::idx_t i = 0; i < n; ++i) {
generatedIds[i] = this->ntotal + i;
}
}
DeviceScope scope(config_.device);
addPaged_((int)n, x, ids ? ids : generatedIds.data());
}
void GpuIndex::addPaged_(int n, const float* x, const Index::idx_t* ids) {
if (n > 0) {
size_t totalSize = (size_t)n * this->d * sizeof(float);
if (totalSize > kAddPageSize || n > kAddVecSize) {
// How many vectors fit into kAddPageSize?
size_t maxNumVecsForPageSize =
kAddPageSize / ((size_t)this->d * sizeof(float));
// Always add at least 1 vector, if we have huge vectors
maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t)1);
size_t tileSize = std::min((size_t)n, maxNumVecsForPageSize);
tileSize = std::min(tileSize, kSearchVecSize);
for (size_t i = 0; i < (size_t)n; i += tileSize) {
size_t curNum = std::min(tileSize, n - i);
addPage_(
curNum,
x + i * (size_t)this->d,
ids ? ids + i : nullptr);
}
} else {
addPage_(n, x, ids);
}
}
}
void GpuIndex::addPage_(int n, const float* x, const Index::idx_t* ids) {
// At this point, `x` can be resident on CPU or GPU, and `ids` may be
// resident on CPU, GPU or may be null.
//
// Before continuing, we guarantee that all data will be resident on the
// GPU.
auto stream = resources_->getDefaultStreamCurrentDevice();
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, this->d});
if (ids) {
auto indices = toDeviceTemporary<Index::idx_t, 1>(
resources_.get(),
config_.device,
const_cast<Index::idx_t*>(ids),
stream,
{n});
addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
} else {
addImpl_(n, vecs.data(), nullptr);
}
}
void GpuIndex::assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k) const {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We need to create a throw-away buffer for distances, which we don't use
// but which we do need for the search call
DeviceTensor<float, 2, true> distances(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
// Forward to search
search(n, x, k, distances.data(), labels);
}
void GpuIndex::search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
if (n == 0 || k == 0) {
// nothing to search
return;
}
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We guarantee that the searchImpl_ will be called with device-resident
// pointers.
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
distances,
stream,
{(int)n, (int)k});
auto outLabels = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(), config_.device, labels, stream, {(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * this->d * sizeof(float);
if (dataSize >= minPagedSize_) {
searchFromCpuPaged_(n, x, k, outDistances.data(), outLabels.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outLabels.data());
}
// Copy back if necessary
fromDevice<float, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outLabels, labels, stream);
}
void GpuIndex::searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
auto stream = resources_->getDefaultStream(config_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, (int)this->d});
searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}
void GpuIndex::searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
// Is pinned memory available?
auto pinnedAlloc = resources_->getPinnedMemory();
int pageSizeInVecs =
(int)((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kNonPinnedPageSize / (sizeof(float) * this->d)));
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * this->d,
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
return;
}
//
// Pinned memory is available, so we can overlap copy with compute.
// We use two pinned memory buffers, and triple-buffer the
// procedure:
//
// 1 CPU copy -> pinned
// 2 pinned copy -> GPU
// 3 GPU compute
//
// 1 2 3 1 2 3 ... (pinned buf A)
// 1 2 3 1 2 ... (pinned buf B)
// 1 2 3 1 ... (pinned buf A)
// time ->
//
auto defaultStream = resources_->getDefaultStream(config_.device);
auto copyStream = resources_->getAsyncCopyStream(config_.device);
FAISS_ASSERT(
(size_t)pageSizeInVecs * this->d <=
(size_t)std::numeric_limits<int>::max());
float* bufPinnedA = (float*)pinnedAlloc.first;
float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d;
float* bufPinned[2] = {bufPinnedA, bufPinnedB};
// Reserve space on the GPU for the destination of the pinned buffer
// copy
DeviceTensor<float, 2, true> bufGpuA(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true> bufGpuB(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
// Copy completion events for the pinned buffers
std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
// Execute completion events for the GPU buffers
std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
// All offsets are in terms of number of vectors; they remain within
// int bounds (as this function handles at most INT_MAX vectors)
// Current start offset for buffer 1
int cur1 = 0;
int cur1BufIndex = 0;
// Current start offset for buffer 2
int cur2 = -1;
int cur2BufIndex = 0;
// Current start offset for buffer 3
int cur3 = -1;
int cur3BufIndex = 0;
while (cur3 < n) {
// Start async pinned -> GPU copy first (buf 2)
if (cur2 != -1 && cur2 < n) {
// Copy pinned to GPU
int numToCopy = std::min(pageSizeInVecs, n - cur2);
// Make sure any previous execution has completed before continuing
auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
if (eventPrev.get()) {
eventPrev->streamWaitOnEvent(copyStream);
}
CUDA_VERIFY(hipMemcpyAsync(
bufGpus[cur2BufIndex]->data(),
bufPinned[cur2BufIndex],
(size_t)numToCopy * this->d * sizeof(float),
hipMemcpyHostToDevice,
copyStream));
// Mark a completion event in this stream
eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));
// We pick up from here
cur3 = cur2;
cur2 += numToCopy;
cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
}
if (cur3 != -1 && cur3 < n) {
// Process on GPU
int numToProcess = std::min(pageSizeInVecs, n - cur3);
// Make sure the previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
FAISS_ASSERT(eventPrev.get());
eventPrev->streamWaitOnEvent(defaultStream);
// Create tensor wrappers
// DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
// {numToProcess, this->d});
auto outDistancesSlice =
outDistances.narrowOutermost(cur3, numToProcess);
auto outIndicesSlice =
outIndices.narrowOutermost(cur3, numToProcess);
searchImpl_(
numToProcess,
bufGpus[cur3BufIndex]->data(),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
// Create completion event
eventGpuExecuteDone[cur3BufIndex].reset(
new CudaEvent(defaultStream));
// We pick up from here
cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
cur3 += numToProcess;
}
if (cur1 < n) {
// Copy CPU mem to CPU pinned
int numToCopy = std::min(pageSizeInVecs, n - cur1);
// Make sure any previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
if (eventPrev.get()) {
eventPrev->cpuWaitOnEvent();
}
memcpy(bufPinned[cur1BufIndex],
x + (size_t)cur1 * this->d,
(size_t)numToCopy * this->d * sizeof(float));
// We pick up from here
cur2 = cur1;
cur1 += numToCopy;
cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
}
}
}
void GpuIndex::compute_residual(
const float* x,
float* residual,
Index::idx_t key) const {
FAISS_THROW_MSG("compute_residual not implemented for this type of index");
}
void GpuIndex::compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const {
FAISS_THROW_MSG(
"compute_residual_n not implemented for this type of index");
}
std::shared_ptr<GpuResources> GpuIndex::getResources() {
return resources_;
}
} // namespace gpu
} // namespace faiss
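
The paging thresholds above only apply to CPU-resident inputs, and the cutover point can be tuned per index via setMinPagingSize(). A small sketch, assuming GpuIndexFlatL2 from faiss/gpu/GpuIndexFlat.h as a concrete GpuIndex subclass:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

void flatSearchExample(const float* xb, int nb, const float* xq, int nq, int d, int k) {
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    // Page CPU -> GPU copies for query batches above 64 MiB instead of the
    // default 256 MiB threshold (kMinPageSize above)
    index.setMinPagingSize((size_t)64 * 1024 * 1024);

    index.add(nb, xb);   // xb may live on the CPU; it is copied in pages as needed

    std::vector<float> distances((size_t)nq * k);
    std::vector<faiss::Index::idx_t> labels((size_t)nq * k);
    index.search(nq, xq, k, distances.data(), labels.data());
}
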
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <algorithm>
#include <limits>
#include <memory>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t)256 * 1024 * 1024;
// Default size for which we page add or search
constexpr size_t kAddPageSize = (size_t)256 * 1024 * 1024;
// Or, maximum number of vectors to consider per page of add or search
constexpr size_t kAddVecSize = (size_t)512 * 1024;
// Use a smaller search size, as precomputed code usage on IVFPQ
// requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = (size_t)32 * 1024;
GpuIndex::GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config)
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
config_.device);
FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
FAISS_THROW_IF_NOT_FMT(
config_.memorySpace == MemorySpace::Device ||
(config_.memorySpace == MemorySpace::Unified &&
getFullUnifiedMemSupport(config_.device)),
"Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
config.device);
metric_arg = metricArg;
FAISS_ASSERT((bool)resources_);
resources_->initializeForDevice(config_.device);
}
int GpuIndex::getDevice() const {
return config_.device;
}
void GpuIndex::copyFrom(const faiss::Index* index) {
d = index->d;
metric_type = index->metric_type;
metric_arg = index->metric_arg;
ntotal = index->ntotal;
is_trained = index->is_trained;
}
void GpuIndex::copyTo(faiss::Index* index) const {
index->d = d;
index->metric_type = metric_type;
index->metric_arg = metric_arg;
index->ntotal = ntotal;
index->is_trained = is_trained;
}
void GpuIndex::setMinPagingSize(size_t size) {
minPagedSize_ = size;
}
size_t GpuIndex::getMinPagingSize() const {
return minPagedSize_;
}
void GpuIndex::add(Index::idx_t n, const float* x) {
// Pass to add_with_ids
add_with_ids(n, x, nullptr);
}
void GpuIndex::add_with_ids(
Index::idx_t n,
const float* x,
const Index::idx_t* ids) {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
if (n == 0) {
// nothing to add
return;
}
std::vector<Index::idx_t> generatedIds;
// Generate IDs if we need them
if (!ids && addImplRequiresIDs_()) {
generatedIds = std::vector<Index::idx_t>(n);
for (Index::idx_t i = 0; i < n; ++i) {
generatedIds[i] = this->ntotal + i;
}
}
DeviceScope scope(config_.device);
addPaged_((int)n, x, ids ? ids : generatedIds.data());
}
void GpuIndex::addPaged_(int n, const float* x, const Index::idx_t* ids) {
if (n > 0) {
size_t totalSize = (size_t)n * this->d * sizeof(float);
if (totalSize > kAddPageSize || n > kAddVecSize) {
// How many vectors fit into kAddPageSize?
size_t maxNumVecsForPageSize =
kAddPageSize / ((size_t)this->d * sizeof(float));
// Always add at least 1 vector, if we have huge vectors
maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t)1);
size_t tileSize = std::min((size_t)n, maxNumVecsForPageSize);
tileSize = std::min(tileSize, kSearchVecSize);
for (size_t i = 0; i < (size_t)n; i += tileSize) {
size_t curNum = std::min(tileSize, n - i);
addPage_(
curNum,
x + i * (size_t)this->d,
ids ? ids + i : nullptr);
}
} else {
addPage_(n, x, ids);
}
}
}
void GpuIndex::addPage_(int n, const float* x, const Index::idx_t* ids) {
// At this point, `x` can be resident on CPU or GPU, and `ids` may be
// resident on CPU, GPU or may be null.
//
// Before continuing, we guarantee that all data will be resident on the
// GPU.
auto stream = resources_->getDefaultStreamCurrentDevice();
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, this->d});
if (ids) {
auto indices = toDeviceTemporary<Index::idx_t, 1>(
resources_.get(),
config_.device,
const_cast<Index::idx_t*>(ids),
stream,
{n});
addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
} else {
addImpl_(n, vecs.data(), nullptr);
}
}
void GpuIndex::assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k) const {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We need to create a throw-away buffer for distances, which we don't use
// but which we do need for the search call
DeviceTensor<float, 2, true> distances(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
// Forward to search
search(n, x, k, distances.data(), labels);
}
void GpuIndex::search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
if (n == 0 || k == 0) {
// nothing to search
return;
}
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We guarantee that the searchImpl_ will be called with device-resident
// pointers.
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
distances,
stream,
{(int)n, (int)k});
auto outLabels = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(), config_.device, labels, stream, {(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * this->d * sizeof(float);
if (dataSize >= minPagedSize_) {
searchFromCpuPaged_(n, x, k, outDistances.data(), outLabels.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outLabels.data());
}
// Copy back if necessary
fromDevice<float, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outLabels, labels, stream);
}
void GpuIndex::searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
auto stream = resources_->getDefaultStream(config_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, (int)this->d});
searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}
void GpuIndex::searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
// Is pinned memory available?
auto pinnedAlloc = resources_->getPinnedMemory();
int pageSizeInVecs =
(int)((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kNonPinnedPageSize / (sizeof(float) * this->d)));
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * this->d,
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
return;
}
//
// Pinned memory is available, so we can overlap copy with compute.
// We use two pinned memory buffers, and triple-buffer the
// procedure:
//
// 1 CPU copy -> pinned
// 2 pinned copy -> GPU
// 3 GPU compute
//
// 1 2 3 1 2 3 ... (pinned buf A)
// 1 2 3 1 2 ... (pinned buf B)
// 1 2 3 1 ... (pinned buf A)
// time ->
//
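    // As a concrete walk-through (illustration only), with page size P:
    //   iteration 1: stage 1 memcpys vectors [0, P) into pinned buf A
    //   iteration 2: stage 2 queues the pinned -> GPU copy of [0, P) from
    //                buf A, stage 3 queues searchImpl_ on [0, P), and
    //                stage 1 memcpys [P, 2P) into pinned buf B
    //   iteration 3: stages 2/3 handle [P, 2P) from buf B while stage 1
    //                refills buf A with [2P, 3P) once its pinned -> GPU
    //                copy has completed (eventPinnedCopyDone)
    // so the host memcpy of page i+1 overlaps with the copy/compute of page i.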
auto defaultStream = resources_->getDefaultStream(config_.device);
auto copyStream = resources_->getAsyncCopyStream(config_.device);
FAISS_ASSERT(
(size_t)pageSizeInVecs * this->d <=
(size_t)std::numeric_limits<int>::max());
float* bufPinnedA = (float*)pinnedAlloc.first;
float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d;
float* bufPinned[2] = {bufPinnedA, bufPinnedB};
// Reserve space on the GPU for the destination of the pinned buffer
// copy
DeviceTensor<float, 2, true> bufGpuA(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true> bufGpuB(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
// Copy completion events for the pinned buffers
std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
// Execute completion events for the GPU buffers
std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
    // All offsets are in terms of number of vectors; they remain within
    // int bounds (this function only ever handles at most INT_MAX vectors)
// Current start offset for buffer 1
int cur1 = 0;
int cur1BufIndex = 0;
// Current start offset for buffer 2
int cur2 = -1;
int cur2BufIndex = 0;
// Current start offset for buffer 3
int cur3 = -1;
int cur3BufIndex = 0;
while (cur3 < n) {
// Start async pinned -> GPU copy first (buf 2)
if (cur2 != -1 && cur2 < n) {
// Copy pinned to GPU
int numToCopy = std::min(pageSizeInVecs, n - cur2);
// Make sure any previous execution has completed before continuing
auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
if (eventPrev.get()) {
eventPrev->streamWaitOnEvent(copyStream);
}
CUDA_VERIFY(cudaMemcpyAsync(
bufGpus[cur2BufIndex]->data(),
bufPinned[cur2BufIndex],
(size_t)numToCopy * this->d * sizeof(float),
cudaMemcpyHostToDevice,
copyStream));
// Mark a completion event in this stream
eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));
// We pick up from here
cur3 = cur2;
cur2 += numToCopy;
cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
}
if (cur3 != -1 && cur3 < n) {
// Process on GPU
int numToProcess = std::min(pageSizeInVecs, n - cur3);
// Make sure the previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
FAISS_ASSERT(eventPrev.get());
eventPrev->streamWaitOnEvent(defaultStream);
            // Narrow the output tensors to the rows for this batch
auto outDistancesSlice =
outDistances.narrowOutermost(cur3, numToProcess);
auto outIndicesSlice =
outIndices.narrowOutermost(cur3, numToProcess);
searchImpl_(
numToProcess,
bufGpus[cur3BufIndex]->data(),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
// Create completion event
eventGpuExecuteDone[cur3BufIndex].reset(
new CudaEvent(defaultStream));
// We pick up from here
cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
cur3 += numToProcess;
}
if (cur1 < n) {
// Copy CPU mem to CPU pinned
int numToCopy = std::min(pageSizeInVecs, n - cur1);
// Make sure any previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
if (eventPrev.get()) {
eventPrev->cpuWaitOnEvent();
}
memcpy(bufPinned[cur1BufIndex],
x + (size_t)cur1 * this->d,
(size_t)numToCopy * this->d * sizeof(float));
// We pick up from here
cur2 = cur1;
cur1 += numToCopy;
cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
}
}
}
void GpuIndex::compute_residual(
const float* x,
float* residual,
Index::idx_t key) const {
FAISS_THROW_MSG("compute_residual not implemented for this type of index");
}
void GpuIndex::compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const {
FAISS_THROW_MSG(
"compute_residual_n not implemented for this type of index");
}
std::shared_ptr<GpuResources> GpuIndex::getResources() {
return resources_;
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/gpu/GpuResources.h>
namespace faiss {
namespace gpu {
struct GpuIndexConfig {
inline GpuIndexConfig() : device(0), memorySpace(MemorySpace::Device) {}
/// GPU device on which the index is resident
int device;
/// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU.
MemorySpace memorySpace;
};
class GpuIndex : public faiss::Index {
public:
GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config);
/// Returns the device that this index is resident on
int getDevice() const;
/// Returns a reference to our GpuResources object that manages memory,
/// stream and handle resources on the GPU
std::shared_ptr<GpuResources> getResources();
    /// Set the minimum data size for searches (in bytes) for which we use
    /// CPU -> GPU paging
void setMinPagingSize(size_t size);
/// Returns the current minimum data size for paged searches
size_t getMinPagingSize() const;
/// `x` can be resident on the CPU or any GPU; copies are performed
/// as needed
    /// Handles paged adds if the add set is too large; eventually calls addImpl_
    void add(Index::idx_t n, const float* x) override;
/// `x` and `ids` can be resident on the CPU or any GPU; copies are
/// performed as needed
    /// Handles paged adds if the add set is too large; eventually calls addImpl_
void add_with_ids(Index::idx_t n, const float* x, const Index::idx_t* ids)
override;
/// `x` and `labels` can be resident on the CPU or any GPU; copies are
/// performed as needed
void assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k = 1) const override;
/// `x`, `distances` and `labels` can be resident on the CPU or any
/// GPU; copies are performed as needed
void search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const override;
/// Overridden to force GPU indices to provide their own GPU-friendly
/// implementation
void compute_residual(const float* x, float* residual, Index::idx_t key)
const override;
/// Overridden to force GPU indices to provide their own GPU-friendly
/// implementation
void compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const override;
protected:
/// Copy what we need from the CPU equivalent
void copyFrom(const faiss::Index* index);
/// Copy what we have to the CPU equivalent
void copyTo(faiss::Index* index) const;
    /// Does addImpl_ require IDs? If so, and no IDs are provided, we will
    /// generate them sequentially based on the order in which the vectors are
    /// added
virtual bool addImplRequiresIDs_() const = 0;
/// Overridden to actually perform the add
/// All data is guaranteed to be resident on our device
virtual void addImpl_(int n, const float* x, const Index::idx_t* ids) = 0;
/// Overridden to actually perform the search
/// All data is guaranteed to be resident on our device
virtual void searchImpl_(
int n,
const float* x,
int k,
float* distances,
Index::idx_t* labels) const = 0;
private:
/// Handles paged adds if the add set is too large, passes to
/// addImpl_ to actually perform the add for the current page
void addPaged_(int n, const float* x, const Index::idx_t* ids);
/// Calls addImpl_ for a single page of GPU-resident data
void addPage_(int n, const float* x, const Index::idx_t* ids);
/// Calls searchImpl_ for a single page of GPU-resident data
void searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const;
/// Calls searchImpl_ for a single page of GPU-resident data,
/// handling paging of the data and copies from the CPU
void searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const;
protected:
/// Manages streams, cuBLAS handles and scratch memory for devices
std::shared_ptr<GpuResources> resources_;
/// Our configuration options
const GpuIndexConfig config_;
/// Size above which we page copies from the CPU to GPU
size_t minPagedSize_;
};
} // namespace gpu
} // namespace faiss
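// A minimal sketch of the contract a concrete subclass has to satisfy
// (the class name below is hypothetical; constructor, training and
// copy logic are omitted):
//
//   class MyGpuIndex : public faiss::gpu::GpuIndex {
//    protected:
//     // addImpl_ will be called with sequentially generated IDs if none
//     // are supplied by the caller
//     bool addImplRequiresIDs_() const override { return true; }
//
//     // x and ids are guaranteed to be resident on config_.device
//     void addImpl_(int n, const float* x, const Index::idx_t* ids) override;
//
//     // x, distances and labels are guaranteed to be device-resident;
//     // must write k results per query
//     void searchImpl_(int n, const float* x, int k,
//                      float* distances, Index::idx_t* labels) const override;
//   };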
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndexBinaryFlat.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/impl/BinaryFlatIndex.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
GpuIndexBinaryFlat::GpuIndexBinaryFlat(
GpuResourcesProvider* provider,
const faiss::IndexBinaryFlat* index,
GpuIndexBinaryFlatConfig config)
: IndexBinary(index->d),
resources_(provider->getResources()),
binaryFlatConfig_(config) {
FAISS_THROW_IF_NOT_FMT(
this->d % 8 == 0,
"vector dimension (number of bits) "
"must be divisible by 8 (passed %d)",
this->d);
// Flat index doesn't need training
this->is_trained = true;
copyFrom(index);
}
GpuIndexBinaryFlat::GpuIndexBinaryFlat(
GpuResourcesProvider* provider,
int dims,
GpuIndexBinaryFlatConfig config)
: IndexBinary(dims),
resources_(provider->getResources()),
binaryFlatConfig_(std::move(config)) {
FAISS_THROW_IF_NOT_FMT(
this->d % 8 == 0,
"vector dimension (number of bits) "
"must be divisible by 8 (passed %d)",
this->d);
// Flat index doesn't need training
this->is_trained = true;
// Construct index
DeviceScope scope(binaryFlatConfig_.device);
data_.reset(new BinaryFlatIndex(
resources_.get(), this->d, binaryFlatConfig_.memorySpace));
}
GpuIndexBinaryFlat::~GpuIndexBinaryFlat() {}
int GpuIndexBinaryFlat::getDevice() const {
return binaryFlatConfig_.device;
}
std::shared_ptr<GpuResources> GpuIndexBinaryFlat::getResources() {
return resources_;
}
void GpuIndexBinaryFlat::copyFrom(const faiss::IndexBinaryFlat* index) {
DeviceScope scope(binaryFlatConfig_.device);
this->d = index->d;
// GPU code has 32 bit indices
FAISS_THROW_IF_NOT_FMT(
index->ntotal <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices; "
"attempting to copy CPU index with %zu parameters",
(size_t)std::numeric_limits<int>::max(),
(size_t)index->ntotal);
this->ntotal = index->ntotal;
// destroy old first before allocating new
data_.reset();
data_.reset(new BinaryFlatIndex(
resources_.get(), this->d, binaryFlatConfig_.memorySpace));
// The index could be empty
if (index->ntotal > 0) {
data_->add(
index->xb.data(),
index->ntotal,
resources_->getDefaultStream(binaryFlatConfig_.device));
}
}
void GpuIndexBinaryFlat::copyTo(faiss::IndexBinaryFlat* index) const {
DeviceScope scope(binaryFlatConfig_.device);
index->d = this->d;
index->ntotal = this->ntotal;
FAISS_ASSERT(data_);
FAISS_ASSERT(data_->getSize() == this->ntotal);
index->xb.resize(this->ntotal * (this->d / 8));
if (this->ntotal > 0) {
fromDevice(
data_->getVectorsRef(),
index->xb.data(),
resources_->getDefaultStream(binaryFlatConfig_.device));
}
}
void GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, const uint8_t* x) {
DeviceScope scope(binaryFlatConfig_.device);
// To avoid multiple re-allocations, ensure we have enough storage
// available
data_->reserve(n, resources_->getDefaultStream(binaryFlatConfig_.device));
// Due to GPU indexing in int32, we can't store more than this
// number of vectors on a GPU
FAISS_THROW_IF_NOT_FMT(
this->ntotal + n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
data_->add(
(const unsigned char*)x,
n,
resources_->getDefaultStream(binaryFlatConfig_.device));
this->ntotal += n;
}
void GpuIndexBinaryFlat::reset() {
DeviceScope scope(binaryFlatConfig_.device);
// Free the underlying memory
data_->reset();
this->ntotal = 0;
}
void GpuIndexBinaryFlat::search(
faiss::IndexBinary::idx_t n,
const uint8_t* x,
faiss::IndexBinary::idx_t k,
int32_t* distances,
faiss::IndexBinary::idx_t* labels) const {
if (n == 0) {
return;
}
FAISS_THROW_IF_NOT(k > 0);
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(binaryFlatConfig_.device);
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<int32_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
distances,
stream,
{(int)n, (int)k});
// FlatIndex only supports an interface returning int indices
DeviceTensor<int, 2, true> outIntIndices(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * (this->d / 8) * sizeof(uint8_t);
if (dataSize >= kMinPageSize) {
searchFromCpuPaged_(
n, x, k, outDistances.data(), outIntIndices.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outIntIndices.data());
}
// Convert and copy int indices out
auto outIndices = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
labels,
stream,
{(int)n, (int)k});
// Convert int to idx_t
convertTensor<int, Index::idx_t, 2>(stream, outIntIndices, outIndices);
// Copy back if necessary
fromDevice<int32_t, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outIndices, labels, stream);
}
void GpuIndexBinaryFlat::searchNonPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const {
Tensor<int32_t, 2, true> outDistances(outDistancesData, {n, k});
Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<uint8_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
const_cast<uint8_t*>(x),
stream,
{n, (int)(this->d / 8)});
data_->query(vecs, k, outDistances, outIndices);
}
void GpuIndexBinaryFlat::searchFromCpuPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const {
Tensor<int32_t, 2, true> outDistances(outDistancesData, {n, k});
Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
auto vectorSize = sizeof(uint8_t) * (this->d / 8);
// Just page without overlapping copy with compute (as GpuIndexFlat does)
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kMinPageSize / vectorSize));
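    // For illustration (assumed d, not from this file): with d = 96 bits
    // (12 bytes per vector), kMinPageSize / 12 = 22369621, which rounds up
    // to the next power of two, 2^25 = 33554432 vectors per batch.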
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * (this->d / 8),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
}
void GpuIndexBinaryFlat::reconstruct(
faiss::IndexBinary::idx_t key,
uint8_t* out) const {
DeviceScope scope(binaryFlatConfig_.device);
FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
auto& vecs = data_->getVectorsRef();
auto vec = vecs[key];
fromDevice(vec.data(), out, vecs.getSize(1), stream);
}
} // namespace gpu
} // namespace faiss
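// A minimal usage sketch for GpuIndexBinaryFlat. StandardGpuResources and the
// GpuIndexBinaryFlatConfig members come from elsewhere in faiss/gpu; the
// sizes below are illustrative only.
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexBinaryFlatConfig config;
//   config.device = 0;
//   faiss::gpu::GpuIndexBinaryFlat index(&res, 256, config);  // d = 256 bits
//   index.add(nb, codes);               // codes: nb * (256 / 8) uint8_t bytes
//   // distances are Hamming distances (int32_t), labels are Index::idx_t
//   index.search(nq, queries, 4, distances, labels);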