Commit 395d2ce6 authored by huchen

init the faiss for rocm

parent 5ded39f5
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuClonerOptions.h>
namespace faiss {
namespace gpu {
GpuClonerOptions::GpuClonerOptions()
: indicesOptions(INDICES_64_BIT),
useFloat16CoarseQuantizer(false),
useFloat16(false),
usePrecomputed(false),
reserveVecs(0),
storeTransposed(false),
verbose(false) {}
GpuMultipleClonerOptions::GpuMultipleClonerOptions()
: shard(false), shard_type(1) {}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/gpu/GpuIndicesOptions.h>
namespace faiss {
namespace gpu {
/// set some options on how to copy to GPU
struct GpuClonerOptions {
GpuClonerOptions();
/// how should indices be stored on index types that support indices
/// (anything but GpuIndexFlat*)?
IndicesOptions indicesOptions;
/// is the coarse quantizer in float16?
bool useFloat16CoarseQuantizer;
/// for GpuIndexIVFFlat, is storage in float16?
/// for GpuIndexIVFPQ, are intermediate calculations in float16?
bool useFloat16;
/// use precomputed tables?
bool usePrecomputed;
/// reserve vectors in the invfiles?
long reserveVecs;
/// For GpuIndexFlat, store data in transposed layout?
bool storeTransposed;
/// Set verbose options on the index
bool verbose;
};
struct GpuMultipleClonerOptions : public GpuClonerOptions {
GpuMultipleClonerOptions();
/// Whether to shard the index across GPUs, versus replication
/// across GPUs
bool shard;
/// IndexIVF::copy_subset_to subset type
int shard_type;
};
} // namespace gpu
} // namespace faiss
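
A quick usage sketch for context (not part of the commit): these options are consumed by the index-cloning helpers. The snippet below is only an illustration and assumes faiss/gpu/GpuCloner.h with index_cpu_to_gpu(), faiss/IndexFlat.h, and StandardGpuResources are available in this build.

#include <faiss/IndexFlat.h>
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuClonerOptions.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <memory>

int main() {
    faiss::gpu::StandardGpuResources res;   // GPU memory / stream provider
    faiss::IndexFlatL2 cpu_index(64);       // small CPU index, d = 64

    faiss::gpu::GpuClonerOptions opts;
    opts.indicesOptions = faiss::gpu::INDICES_64_BIT;
    opts.useFloat16 = true;                 // trade accuracy for memory
    opts.verbose = true;

    // Clone the CPU index to GPU 0 with the options above
    std::unique_ptr<faiss::Index> gpu_index(
            faiss::gpu::index_cpu_to_gpu(&res, /*device=*/0, &cpu_index, &opts));
    return 0;
}
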
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <faiss/gpu/utils/DeviceTensor.cuh>
#include <iostream>
namespace faiss {
namespace gpu {
template <typename T>
void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
// Validate the input data
FAISS_THROW_IF_NOT_MSG(
args.k > 0 || args.k == -1,
"bfKnn: k must be > 0 for top-k reduction, "
"or -1 for all pairwise distances");
FAISS_THROW_IF_NOT_MSG(args.dims > 0, "bfKnn: dims must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.numVectors > 0, "bfKnn: numVectors must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.vectors, "bfKnn: vectors must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.numQueries > 0, "bfKnn: numQueries must be > 0");
FAISS_THROW_IF_NOT_MSG(
args.queries, "bfKnn: queries must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.outDistances,
"bfKnn: outDistances must be provided (passed null)");
FAISS_THROW_IF_NOT_MSG(
args.outIndices || args.k == -1,
"bfKnn: outIndices must be provided (passed null)");
// Don't let the resources go out of scope
auto resImpl = prov->getResources();
auto res = resImpl.get();
auto device = getCurrentDevice();
auto stream = res->getDefaultStreamCurrentDevice();
auto tVectors = toDeviceTemporary<T, 2>(
res,
device,
const_cast<T*>(reinterpret_cast<const T*>(args.vectors)),
stream,
{args.vectorsRowMajor ? args.numVectors : args.dims,
args.vectorsRowMajor ? args.dims : args.numVectors});
auto tQueries = toDeviceTemporary<T, 2>(
res,
device,
const_cast<T*>(reinterpret_cast<const T*>(args.queries)),
stream,
{args.queriesRowMajor ? args.numQueries : args.dims,
args.queriesRowMajor ? args.dims : args.numQueries});
DeviceTensor<float, 1, true> tVectorNorms;
if (args.vectorNorms) {
tVectorNorms = toDeviceTemporary<float, 1>(
res,
device,
const_cast<float*>(args.vectorNorms),
stream,
{args.numVectors});
}
auto tOutDistances = toDeviceTemporary<float, 2>(
res,
device,
args.outDistances,
stream,
{args.numQueries, args.k == -1 ? args.numVectors : args.k});
if (args.k == -1) {
// Reporting all pairwise distances
allPairwiseDistanceOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.metric,
args.metricArg,
tOutDistances);
} else if (args.outIndicesType == IndicesDataType::I64) {
// The brute-force API only supports an interface for i32 indices,
// so we must create an output i32 buffer then convert back
DeviceTensor<int, 2, true> tOutIntIndices(
res,
makeTempAlloc(AllocType::Other, stream),
{args.numQueries, args.k});
// Since we've guaranteed that all arguments are on device, call the
// implementation
bfKnnOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.k,
args.metric,
args.metricArg,
tOutDistances,
tOutIntIndices,
args.ignoreOutDistances);
// Convert and copy int indices out
auto tOutIndices = toDeviceTemporary<Index::idx_t, 2>(
res,
device,
(Index::idx_t*)args.outIndices,
stream,
{args.numQueries, args.k});
// Convert int to idx_t
convertTensor<int, Index::idx_t, 2>(
stream, tOutIntIndices, tOutIndices);
// Copy back if necessary
fromDevice<Index::idx_t, 2>(
tOutIndices, (Index::idx_t*)args.outIndices, stream);
} else if (args.outIndicesType == IndicesDataType::I32) {
// We can use the brute-force API directly, as it takes i32 indices
// FIXME: convert to int32_t everywhere?
static_assert(sizeof(int) == 4, "");
auto tOutIntIndices = toDeviceTemporary<int, 2>(
res,
device,
(int*)args.outIndices,
stream,
{args.numQueries, args.k});
// Since we've guaranteed that all arguments are on device, call the
// implementation
bfKnnOnDevice<T>(
res,
device,
stream,
tVectors,
args.vectorsRowMajor,
args.vectorNorms ? &tVectorNorms : nullptr,
tQueries,
args.queriesRowMajor,
args.k,
args.metric,
args.metricArg,
tOutDistances,
tOutIntIndices,
args.ignoreOutDistances);
// Copy back if necessary
fromDevice<int, 2>(tOutIntIndices, (int*)args.outIndices, stream);
} else {
FAISS_THROW_MSG("unknown outIndicesType");
}
// Copy distances back if necessary
fromDevice<float, 2>(tOutDistances, args.outDistances, stream);
}
void bfKnn(GpuResourcesProvider* res, const GpuDistanceParams& args) {
// For now, both vectors and queries must be of the same data type
FAISS_THROW_IF_NOT_MSG(
args.vectorType == args.queryType,
"limitation: both vectorType and queryType must currently "
"be the same (F32 or F16");
if (args.vectorType == DistanceDataType::F32) {
bfKnnConvert<float>(res, args);
} else if (args.vectorType == DistanceDataType::F16) {
bfKnnConvert<half>(res, args);
} else {
FAISS_THROW_MSG("unknown vectorType");
}
}
// legacy version
void bruteForceKnn(
GpuResourcesProvider* res,
faiss::MetricType metric,
// A region of memory size numVectors x dims, with dims
// innermost
const float* vectors,
bool vectorsRowMajor,
int numVectors,
// A region of memory size numQueries x dims, with dims
// innermost
const float* queries,
bool queriesRowMajor,
int numQueries,
int dims,
int k,
// A region of memory size numQueries x k, with k
// innermost
float* outDistances,
// A region of memory size numQueries x k, with k
// innermost
Index::idx_t* outIndices) {
std::cerr << "bruteForceKnn is deprecated; call bfKnn instead" << std::endl;
GpuDistanceParams args;
args.metric = metric;
args.k = k;
args.dims = dims;
args.vectors = vectors;
args.vectorsRowMajor = vectorsRowMajor;
args.numVectors = numVectors;
args.queries = queries;
args.queriesRowMajor = queriesRowMajor;
args.numQueries = numQueries;
args.outDistances = outDistances;
args.outIndices = outIndices;
bfKnn(res, args);
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
namespace faiss {
namespace gpu {
class GpuResourcesProvider;
// Scalar type of the vector data
enum class DistanceDataType {
F32 = 1,
F16,
};
// Scalar type of the indices data
enum class IndicesDataType {
I64 = 1,
I32,
};
/// Arguments to brute-force GPU k-nearest neighbor searching
struct GpuDistanceParams {
GpuDistanceParams()
: metric(faiss::MetricType::METRIC_L2),
metricArg(0),
k(0),
dims(0),
vectors(nullptr),
vectorType(DistanceDataType::F32),
vectorsRowMajor(true),
numVectors(0),
vectorNorms(nullptr),
queries(nullptr),
queryType(DistanceDataType::F32),
queriesRowMajor(true),
numQueries(0),
outDistances(nullptr),
ignoreOutDistances(false),
outIndicesType(IndicesDataType::I64),
outIndices(nullptr) {}
//
// Search parameters
//
/// Search parameter: distance metric
faiss::MetricType metric;
/// Search parameter: distance metric argument (if applicable)
/// For metric == METRIC_Lp, this is the p-value
float metricArg;
/// Search parameter: return k nearest neighbors
/// If the value provided is -1, then we report all pairwise distances
/// without top-k filtering
int k;
/// Vector dimensionality
int dims;
//
// Vectors being queried
//
/// If vectorsRowMajor is true, this is
/// numVectors x dims, with dims innermost; otherwise,
/// dims x numVectors, with numVectors innermost
const void* vectors;
DistanceDataType vectorType;
bool vectorsRowMajor;
int numVectors;
/// Precomputed L2 norms for each vector in `vectors`, which can be
/// optionally provided in advance to speed computation for METRIC_L2
const float* vectorNorms;
//
// The query vectors (i.e., find k-nearest neighbors in `vectors` for each
// of the `queries`)
//
/// If queriesRowMajor is true, this is
/// numQueries x dims, with dims innermost; otherwise,
/// dims x numQueries, with numQueries innermost
const void* queries;
DistanceDataType queryType;
bool queriesRowMajor;
int numQueries;
//
// Output results
//
/// A region of memory size numQueries x k, with k
/// innermost (row major) if k > 0, or if k == -1, a region of memory of
/// size numQueries x numVectors
float* outDistances;
/// Do we only care about the indices reported, rather than the output
/// distances? Not used if k == -1 (all pairwise distances)
bool ignoreOutDistances;
/// A region of memory size numQueries x k, with k
/// innermost (row major). Not used if k == -1 (all pairwise distances)
IndicesDataType outIndicesType;
void* outIndices;
};
/// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest
/// neighbor searches on an externally-provided region of memory (e.g., from a
/// pytorch tensor).
/// The data (vectors, queries, outDistances, outIndices) can be resident on the
/// GPU or the CPU, but all calculations are performed on the GPU. If the result
/// buffers are on the CPU, results will be copied back when done.
///
/// All GPU computation is performed on the current CUDA device, and ordered
/// with respect to resources->getDefaultStreamCurrentDevice().
///
/// For each vector in `queries`, searches all of `vectors` to find its k
/// nearest neighbors with respect to the given metric
void bfKnn(GpuResourcesProvider* resources, const GpuDistanceParams& args);
/// Deprecated legacy implementation
void bruteForceKnn(
GpuResourcesProvider* resources,
faiss::MetricType metric,
// If vectorsRowMajor is true, this is
// numVectors x dims, with dims innermost; otherwise,
// dims x numVectors, with numVectors innermost
const float* vectors,
bool vectorsRowMajor,
int numVectors,
// If queriesRowMajor is true, this is
// numQueries x dims, with dims innermost; otherwise,
// dims x numQueries, with numQueries innermost
const float* queries,
bool queriesRowMajor,
int numQueries,
int dims,
int k,
// A region of memory size numQueries x k, with k
// innermost (row major)
float* outDistances,
// A region of memory size numQueries x k, with k
// innermost (row major)
Index::idx_t* outIndices);
} // namespace gpu
} // namespace faiss
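
For reference, a minimal host-side sketch of driving bfKnn() with CPU-resident, row-major float32 buffers; the wrapper copies results back to the CPU output buffers. StandardGpuResources (a GpuResourcesProvider) is assumed to be available; the remaining names come from this header.

#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

void knnExample(const float* xb, int nb, const float* xq, int nq, int d, int k) {
    faiss::gpu::StandardGpuResources res;

    std::vector<float> distances((size_t)nq * k);
    std::vector<faiss::Index::idx_t> indices((size_t)nq * k);

    faiss::gpu::GpuDistanceParams args;     // defaults: F32 data, row major, I64 indices
    args.metric = faiss::MetricType::METRIC_L2;
    args.k = k;
    args.dims = d;
    args.vectors = xb;                      // nb x d
    args.numVectors = nb;
    args.queries = xq;                      // nq x d
    args.numQueries = nq;
    args.outDistances = distances.data();
    args.outIndices = indices.data();

    faiss::gpu::bfKnn(&res, args);          // runs on the current device's default stream
}
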
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef GPU_FAISS_ASSERT_INCLUDED
#define GPU_FAISS_ASSERT_INCLUDED
#include <hip/hip_runtime.h>
#include <faiss/impl/FaissAssert.h>
///
/// Assertions
///
#ifdef __CUDA_ARCH__
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
#else
#define GPU_FAISS_ASSERT(X) FAISS_ASSERT(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) FAISS_ASSERT_MSG(X, MSG)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) FAISS_ASSERT_FMT(X, FMT, __VA_ARGS__)
#endif // __CUDA_ARCH__
#endif
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#ifndef GPU_FAISS_ASSERT_INCLUDED
#define GPU_FAISS_ASSERT_INCLUDED
#include <cuda.h>
#include <faiss/impl/FaissAssert.h>
///
/// Assertions
///
#ifdef __CUDA_ARCH__
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
#else
#define GPU_FAISS_ASSERT(X) FAISS_ASSERT(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) FAISS_ASSERT_MSG(X, MSG)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) FAISS_ASSERT_FMT(X, FMT, __VA_ARGS__)
#endif // __CUDA_ARCH__
#endif
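
A small illustration of how these macros are meant to be used; the kernel is hypothetical and the include path is assumed to match the header above. In device code (where __CUDA_ARCH__ is defined) the message and format arguments are dropped and only assert(X) remains; on the host the full FAISS_ASSERT* diagnostics are kept.

#include <faiss/gpu/GpuFaissAssert.h>  // path assumed
#include <cassert>

__global__ void checkedCopy(const float* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    GPU_FAISS_ASSERT(in != out);       // device build: plain assert(in != out)
    if (i < n) {
        out[i] = in[i];
    }
}

void hostCheck(int numVecs) {
    // host build: formatted message is reported via FAISS_ASSERT_FMT
    GPU_FAISS_ASSERT_FMT(numVecs >= 0, "invalid numVecs %d", numVecs);
}
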
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIcmEncoder.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <faiss/utils/WorkerThread.h>
#include <faiss/gpu/impl/IcmEncoder.cuh>
#include <algorithm>
namespace faiss {
namespace gpu {
///< A helper structure to support multi-GPU
struct IcmEncoderShards {
std::vector<std::pair<
std::unique_ptr<IcmEncoderImpl>,
std::unique_ptr<WorkerThread>>>
workers;
void add(IcmEncoderImpl* encoder) {
workers.emplace_back(std::make_pair(
std::unique_ptr<IcmEncoderImpl>(encoder),
std::unique_ptr<WorkerThread>(new WorkerThread)));
}
IcmEncoderImpl* at(int idx) {
return workers[idx].first.get();
}
///< call f(idx, encoder) for each encoder
void runOnShards(std::function<void(int, IcmEncoderImpl*)> f) {
std::vector<std::future<bool>> v;
for (int i = 0; i < this->workers.size(); ++i) {
auto& p = this->workers[i];
auto encoder = p.first.get();
v.emplace_back(p.second->add([f, i, encoder]() { f(i, encoder); }));
}
for (int i = 0; i < v.size(); ++i) {
auto& fut = v[i];
fut.get(); // no exception handling; a failure in any worker thread will crash
}
}
size_t size() {
return workers.size();
}
};
GpuIcmEncoder::GpuIcmEncoder(
const LocalSearchQuantizer* lsq,
const std::vector<GpuResourcesProvider*>& provs,
const std::vector<int>& devices)
: lsq::IcmEncoder(lsq), shards(new IcmEncoderShards()) {
// create an IcmEncoderImpl instance for each device.
for (size_t i = 0; i < provs.size(); i++) {
shards->add(new IcmEncoderImpl(
lsq->M, lsq->K, lsq->d, provs[i], devices[i]));
}
}
GpuIcmEncoder::~GpuIcmEncoder() {}
void GpuIcmEncoder::set_binary_term() {
auto fn = [=](int idx, IcmEncoderImpl* encoder) {
encoder->setBinaryTerm(lsq->codebooks.data());
};
shards->runOnShards(fn);
}
void GpuIcmEncoder::encode(
int32_t* codes,
const float* x,
std::mt19937& gen,
size_t n,
size_t ils_iters) const {
size_t nshards = shards->size();
size_t shard_size = (n + nshards - 1) / nshards;
auto codebooks = lsq->codebooks.data();
auto M = lsq->M;
auto d = lsq->d;
auto nperts = lsq->nperts;
auto icm_iters = lsq->icm_iters;
auto seed = gen();
// split input data
auto fn = [=](int idx, IcmEncoderImpl* encoder) {
size_t i0 = idx * shard_size;
size_t ni = std::min(shard_size, n - i0);
auto xi = x + i0 * d;
auto ci = codes + i0 * M;
std::mt19937 geni(idx + seed); // different seed for each shard
encoder->encode(
ci, xi, codebooks, geni, ni, nperts, ils_iters, icm_iters);
};
shards->runOnShards(fn);
}
GpuIcmEncoderFactory::GpuIcmEncoderFactory(int ngpus) {
for (int i = 0; i < ngpus; i++) {
provs.push_back(new StandardGpuResources());
devices.push_back(i);
}
}
lsq::IcmEncoder* GpuIcmEncoderFactory::get(const LocalSearchQuantizer* lsq) {
return new GpuIcmEncoder(lsq, provs, devices);
}
} // namespace gpu
} // namespace faiss
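
The split in encode() above is plain ceil division over the shards: each shard gets shard_size = ceil(n / nshards) vectors and the last shard takes whatever remains. A tiny standalone illustration of the same arithmetic:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    size_t n = 10, nshards = 3;
    size_t shard_size = (n + nshards - 1) / nshards;   // ceil(10 / 3) = 4
    for (size_t idx = 0; idx < nshards; ++idx) {
        size_t i0 = idx * shard_size;                  // 0, 4, 8
        size_t ni = std::min(shard_size, n - i0);      // 4, 4, 2
        std::printf("shard %zu: vectors [%zu, %zu)\n", idx, i0, i0 + ni);
    }
    return 0;
}
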
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/impl/LocalSearchQuantizer.h>
#include <memory>
namespace faiss {
namespace gpu {
class GpuResourcesProvider;
struct IcmEncoderShards;
/** Perform LSQ encoding on GPU.
*
* Split input vectors to different devices and call IcmEncoderImpl::encode
* to encode them
*/
class GpuIcmEncoder : public lsq::IcmEncoder {
public:
GpuIcmEncoder(
const LocalSearchQuantizer* lsq,
const std::vector<GpuResourcesProvider*>& provs,
const std::vector<int>& devices);
~GpuIcmEncoder();
GpuIcmEncoder(const GpuIcmEncoder&) = delete;
GpuIcmEncoder& operator=(const GpuIcmEncoder&) = delete;
void set_binary_term() override;
void encode(
int32_t* codes,
const float* x,
std::mt19937& gen,
size_t n,
size_t ils_iters) const override;
private:
std::unique_ptr<IcmEncoderShards> shards;
};
struct GpuIcmEncoderFactory : public lsq::IcmEncoderFactory {
explicit GpuIcmEncoderFactory(int ngpus = 1);
lsq::IcmEncoder* get(const LocalSearchQuantizer* lsq) override;
std::vector<GpuResourcesProvider*> provs;
std::vector<int> devices;
};
} // namespace gpu
} // namespace faiss
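
A short sketch of how this factory plugs into LSQ training. LocalSearchQuantizer comes from the header included above; the icm_encoder_factory member, train(), compute_codes(), and code_size are assumed to behave as in mainline faiss.

#include <faiss/gpu/GpuIcmEncoder.h>
#include <faiss/impl/LocalSearchQuantizer.h>
#include <cstdint>
#include <vector>

void trainLsqOnGpu(const float* xt, size_t nt, size_t d) {
    // 8 sub-codebooks of 8 bits each; values chosen only for illustration
    faiss::LocalSearchQuantizer lsq(d, /*M=*/8, /*nbits=*/8);

    // Route the ICM encoding step to the GPU implementation (1 GPU here)
    lsq.icm_encoder_factory = new faiss::gpu::GpuIcmEncoderFactory(/*ngpus=*/1);

    lsq.train(nt, xt);

    std::vector<uint8_t> codes(nt * lsq.code_size);
    lsq.compute_codes(xt, codes.data(), nt);
}
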
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <algorithm>
#include <limits>
#include <memory>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t)256 * 1024 * 1024;
// Default size for which we page add or search
constexpr size_t kAddPageSize = (size_t)256 * 1024 * 1024;
// Or, maximum number of vectors to consider per page of add or search
constexpr size_t kAddVecSize = (size_t)512 * 1024;
// Use a smaller search size, as precomputed code usage on IVFPQ
// requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = (size_t)32 * 1024;
GpuIndex::GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config)
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
config_.device);
FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
FAISS_THROW_IF_NOT_FMT(
config_.memorySpace == MemorySpace::Device ||
(config_.memorySpace == MemorySpace::Unified &&
getFullUnifiedMemSupport(config_.device)),
"Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
config.device);
metric_arg = metricArg;
FAISS_ASSERT((bool)resources_);
resources_->initializeForDevice(config_.device);
}
int GpuIndex::getDevice() const {
return config_.device;
}
void GpuIndex::copyFrom(const faiss::Index* index) {
d = index->d;
metric_type = index->metric_type;
metric_arg = index->metric_arg;
ntotal = index->ntotal;
is_trained = index->is_trained;
}
void GpuIndex::copyTo(faiss::Index* index) const {
index->d = d;
index->metric_type = metric_type;
index->metric_arg = metric_arg;
index->ntotal = ntotal;
index->is_trained = is_trained;
}
void GpuIndex::setMinPagingSize(size_t size) {
minPagedSize_ = size;
}
size_t GpuIndex::getMinPagingSize() const {
return minPagedSize_;
}
void GpuIndex::add(Index::idx_t n, const float* x) {
// Pass to add_with_ids
add_with_ids(n, x, nullptr);
}
void GpuIndex::add_with_ids(
Index::idx_t n,
const float* x,
const Index::idx_t* ids) {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
if (n == 0) {
// nothing to add
return;
}
std::vector<Index::idx_t> generatedIds;
// Generate IDs if we need them
if (!ids && addImplRequiresIDs_()) {
generatedIds = std::vector<Index::idx_t>(n);
for (Index::idx_t i = 0; i < n; ++i) {
generatedIds[i] = this->ntotal + i;
}
}
DeviceScope scope(config_.device);
addPaged_((int)n, x, ids ? ids : generatedIds.data());
}
void GpuIndex::addPaged_(int n, const float* x, const Index::idx_t* ids) {
if (n > 0) {
size_t totalSize = (size_t)n * this->d * sizeof(float);
if (totalSize > kAddPageSize || n > kAddVecSize) {
// How many vectors fit into kAddPageSize?
size_t maxNumVecsForPageSize =
kAddPageSize / ((size_t)this->d * sizeof(float));
// Always add at least 1 vector, if we have huge vectors
maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t)1);
size_t tileSize = std::min((size_t)n, maxNumVecsForPageSize);
tileSize = std::min(tileSize, kSearchVecSize);
for (size_t i = 0; i < (size_t)n; i += tileSize) {
size_t curNum = std::min(tileSize, n - i);
addPage_(
curNum,
x + i * (size_t)this->d,
ids ? ids + i : nullptr);
}
} else {
addPage_(n, x, ids);
}
}
}
void GpuIndex::addPage_(int n, const float* x, const Index::idx_t* ids) {
// At this point, `x` can be resident on CPU or GPU, and `ids` may be
// resident on CPU, GPU or may be null.
//
// Before continuing, we guarantee that all data will be resident on the
// GPU.
auto stream = resources_->getDefaultStreamCurrentDevice();
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, this->d});
if (ids) {
auto indices = toDeviceTemporary<Index::idx_t, 1>(
resources_.get(),
config_.device,
const_cast<Index::idx_t*>(ids),
stream,
{n});
addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
} else {
addImpl_(n, vecs.data(), nullptr);
}
}
void GpuIndex::assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k) const {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We need to create a throw-away buffer for distances, which we don't use
// but which we do need for the search call
DeviceTensor<float, 2, true> distances(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
// Forward to search
search(n, x, k, distances.data(), labels);
}
void GpuIndex::search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
if (n == 0 || k == 0) {
// nothing to search
return;
}
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We guarantee that the searchImpl_ will be called with device-resident
// pointers.
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
distances,
stream,
{(int)n, (int)k});
auto outLabels = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(), config_.device, labels, stream, {(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * this->d * sizeof(float);
if (dataSize >= minPagedSize_) {
searchFromCpuPaged_(n, x, k, outDistances.data(), outLabels.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outLabels.data());
}
// Copy back if necessary
fromDevice<float, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outLabels, labels, stream);
}
void GpuIndex::searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
auto stream = resources_->getDefaultStream(config_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, (int)this->d});
searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}
void GpuIndex::searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
// Is pinned memory available?
auto pinnedAlloc = resources_->getPinnedMemory();
int pageSizeInVecs =
(int)((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kNonPinnedPageSize / (sizeof(float) * this->d)));
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * this->d,
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
return;
}
//
// Pinned memory is available, so we can overlap copy with compute.
// We use two pinned memory buffers, and triple-buffer the
// procedure:
//
// 1 CPU copy -> pinned
// 2 pinned copy -> GPU
// 3 GPU compute
//
// 1 2 3 1 2 3 ... (pinned buf A)
// 1 2 3 1 2 ... (pinned buf B)
// 1 2 3 1 ... (pinned buf A)
// time ->
//
auto defaultStream = resources_->getDefaultStream(config_.device);
auto copyStream = resources_->getAsyncCopyStream(config_.device);
FAISS_ASSERT(
(size_t)pageSizeInVecs * this->d <=
(size_t)std::numeric_limits<int>::max());
float* bufPinnedA = (float*)pinnedAlloc.first;
float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d;
float* bufPinned[2] = {bufPinnedA, bufPinnedB};
// Reserve space on the GPU for the destination of the pinned buffer
// copy
DeviceTensor<float, 2, true> bufGpuA(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true> bufGpuB(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
// Copy completion events for the pinned buffers
std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
// Execute completion events for the GPU buffers
std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
// All offsets are in terms of number of vectors; they remain within
// int bounds (as this function handles at most INT_MAX vectors)
// Current start offset for buffer 1
int cur1 = 0;
int cur1BufIndex = 0;
// Current start offset for buffer 2
int cur2 = -1;
int cur2BufIndex = 0;
// Current start offset for buffer 3
int cur3 = -1;
int cur3BufIndex = 0;
while (cur3 < n) {
// Start async pinned -> GPU copy first (buf 2)
if (cur2 != -1 && cur2 < n) {
// Copy pinned to GPU
int numToCopy = std::min(pageSizeInVecs, n - cur2);
// Make sure any previous execution has completed before continuing
auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
if (eventPrev.get()) {
eventPrev->streamWaitOnEvent(copyStream);
}
CUDA_VERIFY(hipMemcpyAsync(
bufGpus[cur2BufIndex]->data(),
bufPinned[cur2BufIndex],
(size_t)numToCopy * this->d * sizeof(float),
hipMemcpyHostToDevice,
copyStream));
// Mark a completion event in this stream
eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));
// We pick up from here
cur3 = cur2;
cur2 += numToCopy;
cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
}
if (cur3 != -1 && cur3 < n) {
// Process on GPU
int numToProcess = std::min(pageSizeInVecs, n - cur3);
// Make sure the previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
FAISS_ASSERT(eventPrev.get());
eventPrev->streamWaitOnEvent(defaultStream);
// Create tensor wrappers
// DeviceTensor<float, 2, true> input(bufGpus[cur3BufIndex]->data(),
// {numToProcess, this->d});
auto outDistancesSlice =
outDistances.narrowOutermost(cur3, numToProcess);
auto outIndicesSlice =
outIndices.narrowOutermost(cur3, numToProcess);
searchImpl_(
numToProcess,
bufGpus[cur3BufIndex]->data(),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
// Create completion event
eventGpuExecuteDone[cur3BufIndex].reset(
new CudaEvent(defaultStream));
// We pick up from here
cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
cur3 += numToProcess;
}
if (cur1 < n) {
// Copy CPU mem to CPU pinned
int numToCopy = std::min(pageSizeInVecs, n - cur1);
// Make sure any previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
if (eventPrev.get()) {
eventPrev->cpuWaitOnEvent();
}
memcpy(bufPinned[cur1BufIndex],
x + (size_t)cur1 * this->d,
(size_t)numToCopy * this->d * sizeof(float));
// We pick up from here
cur2 = cur1;
cur1 += numToCopy;
cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
}
}
}
void GpuIndex::compute_residual(
const float* x,
float* residual,
Index::idx_t key) const {
FAISS_THROW_MSG("compute_residual not implemented for this type of index");
}
void GpuIndex::compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const {
FAISS_THROW_MSG(
"compute_residual_n not implemented for this type of index");
}
std::shared_ptr<GpuResources> GpuIndex::getResources() {
return resources_;
}
} // namespace gpu
} // namespace faiss
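
The paging thresholds above only apply to CPU-resident inputs, and the cutover point can be tuned per index via setMinPagingSize(). A small sketch, assuming GpuIndexFlatL2 from faiss/gpu/GpuIndexFlat.h as a concrete GpuIndex subclass:

#include <faiss/gpu/GpuIndexFlat.h>
#include <faiss/gpu/StandardGpuResources.h>
#include <vector>

void flatSearchExample(const float* xb, int nb, const float* xq, int nq, int d, int k) {
    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexFlatL2 index(&res, d);

    // Page CPU -> GPU copies for query batches above 64 MiB instead of the
    // default 256 MiB threshold (kMinPageSize above)
    index.setMinPagingSize((size_t)64 * 1024 * 1024);

    index.add(nb, xb);   // xb may live on the CPU; it is copied in pages as needed

    std::vector<float> distances((size_t)nq * k);
    std::vector<faiss::Index::idx_t> labels((size_t)nq * k);
    index.search(nq, xq, k, distances.data(), labels.data());
}
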
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndex.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/CopyUtils.cuh>
#include <algorithm>
#include <limits>
#include <memory>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
/// Size above which we page copies from the CPU to GPU (non-paged
/// memory usage)
constexpr size_t kNonPinnedPageSize = (size_t)256 * 1024 * 1024;
// Default size for which we page add or search
constexpr size_t kAddPageSize = (size_t)256 * 1024 * 1024;
// Or, maximum number of vectors to consider per page of add or search
constexpr size_t kAddVecSize = (size_t)512 * 1024;
// Use a smaller search size, as precomputed code usage on IVFPQ
// requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = (size_t)32 * 1024;
GpuIndex::GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config)
: Index(dims, metric),
resources_(resources),
config_(config),
minPagedSize_(kMinPageSize) {
FAISS_THROW_IF_NOT_FMT(
config_.device < getNumDevices(),
"Invalid GPU device %d",
config_.device);
FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");
FAISS_THROW_IF_NOT_FMT(
config_.memorySpace == MemorySpace::Device ||
(config_.memorySpace == MemorySpace::Unified &&
getFullUnifiedMemSupport(config_.device)),
"Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
config.device);
metric_arg = metricArg;
FAISS_ASSERT((bool)resources_);
resources_->initializeForDevice(config_.device);
}
int GpuIndex::getDevice() const {
return config_.device;
}
void GpuIndex::copyFrom(const faiss::Index* index) {
d = index->d;
metric_type = index->metric_type;
metric_arg = index->metric_arg;
ntotal = index->ntotal;
is_trained = index->is_trained;
}
void GpuIndex::copyTo(faiss::Index* index) const {
index->d = d;
index->metric_type = metric_type;
index->metric_arg = metric_arg;
index->ntotal = ntotal;
index->is_trained = is_trained;
}
void GpuIndex::setMinPagingSize(size_t size) {
minPagedSize_ = size;
}
size_t GpuIndex::getMinPagingSize() const {
return minPagedSize_;
}
void GpuIndex::add(Index::idx_t n, const float* x) {
// Pass to add_with_ids
add_with_ids(n, x, nullptr);
}
void GpuIndex::add_with_ids(
Index::idx_t n,
const float* x,
const Index::idx_t* ids) {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
if (n == 0) {
// nothing to add
return;
}
std::vector<Index::idx_t> generatedIds;
// Generate IDs if we need them
if (!ids && addImplRequiresIDs_()) {
generatedIds = std::vector<Index::idx_t>(n);
for (Index::idx_t i = 0; i < n; ++i) {
generatedIds[i] = this->ntotal + i;
}
}
DeviceScope scope(config_.device);
addPaged_((int)n, x, ids ? ids : generatedIds.data());
}
void GpuIndex::addPaged_(int n, const float* x, const Index::idx_t* ids) {
if (n > 0) {
size_t totalSize = (size_t)n * this->d * sizeof(float);
if (totalSize > kAddPageSize || n > kAddVecSize) {
// How many vectors fit into kAddPageSize?
size_t maxNumVecsForPageSize =
kAddPageSize / ((size_t)this->d * sizeof(float));
// Always add at least 1 vector, if we have huge vectors
maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t)1);
size_t tileSize = std::min((size_t)n, maxNumVecsForPageSize);
tileSize = std::min(tileSize, kSearchVecSize);
for (size_t i = 0; i < (size_t)n; i += tileSize) {
size_t curNum = std::min(tileSize, n - i);
addPage_(
curNum,
x + i * (size_t)this->d,
ids ? ids + i : nullptr);
}
} else {
addPage_(n, x, ids);
}
}
}
void GpuIndex::addPage_(int n, const float* x, const Index::idx_t* ids) {
// At this point, `x` can be resident on CPU or GPU, and `ids` may be
// resident on CPU, GPU or may be null.
//
// Before continuing, we guarantee that all data will be resident on the
// GPU.
auto stream = resources_->getDefaultStreamCurrentDevice();
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, this->d});
if (ids) {
auto indices = toDeviceTemporary<Index::idx_t, 1>(
resources_.get(),
config_.device,
const_cast<Index::idx_t*>(ids),
stream,
{n});
addImpl_(n, vecs.data(), ids ? indices.data() : nullptr);
} else {
addImpl_(n, vecs.data(), nullptr);
}
}
void GpuIndex::assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k) const {
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We need to create a throw-away buffer for distances, which we don't use
// but which we do need for the search call
DeviceTensor<float, 2, true> distances(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
// Forward to search
search(n, x, k, distances.data(), labels);
}
void GpuIndex::search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const {
FAISS_THROW_IF_NOT(k > 0);
FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %d indices",
std::numeric_limits<int>::max());
// Maximum k-selection supported is based on the CUDA SDK
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU index only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
if (n == 0 || k == 0) {
// nothing to search
return;
}
DeviceScope scope(config_.device);
auto stream = resources_->getDefaultStream(config_.device);
// We guarantee that the searchImpl_ will be called with device-resident
// pointers.
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
distances,
stream,
{(int)n, (int)k});
auto outLabels = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(), config_.device, labels, stream, {(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * this->d * sizeof(float);
if (dataSize >= minPagedSize_) {
searchFromCpuPaged_(n, x, k, outDistances.data(), outLabels.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outLabels.data());
}
// Copy back if necessary
fromDevice<float, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outLabels, labels, stream);
}
void GpuIndex::searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
auto stream = resources_->getDefaultStream(config_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<float, 2>(
resources_.get(),
config_.device,
const_cast<float*>(x),
stream,
{n, (int)this->d});
searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData);
}
void GpuIndex::searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const {
Tensor<float, 2, true> outDistances(outDistancesData, {n, k});
Tensor<Index::idx_t, 2, true> outIndices(outIndicesData, {n, k});
// Is pinned memory available?
auto pinnedAlloc = resources_->getPinnedMemory();
int pageSizeInVecs =
(int)((pinnedAlloc.second / 2) / (sizeof(float) * this->d));
if (!pinnedAlloc.first || pageSizeInVecs < 1) {
// Just page without overlapping copy with compute
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kNonPinnedPageSize / (sizeof(float) * this->d)));
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * this->d,
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
return;
}
//
// Pinned memory is available, so we can overlap copy with compute.
// We use two pinned memory buffers, and triple-buffer the
// procedure:
//
// 1 CPU copy -> pinned
// 2 pinned copy -> GPU
// 3 GPU compute
//
// 1 2 3 1 2 3 ... (pinned buf A)
// 1 2 3 1 2 ... (pinned buf B)
// 1 2 3 1 ... (pinned buf A)
// time ->
//
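    // As a concrete walk-through (illustration only), with page size P:
    //   iteration 1: stage 1 memcpys vectors [0, P) into pinned buf A
    //   iteration 2: stage 2 queues the pinned -> GPU copy of [0, P) from
    //                buf A, stage 3 queues searchImpl_ on [0, P), and
    //                stage 1 memcpys [P, 2P) into pinned buf B
    //   iteration 3: stages 2/3 handle [P, 2P) from buf B while stage 1
    //                refills buf A with [2P, 3P) once its pinned -> GPU
    //                copy has completed (eventPinnedCopyDone)
    // so the host memcpy of page i+1 overlaps with the copy/compute of page i.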
auto defaultStream = resources_->getDefaultStream(config_.device);
auto copyStream = resources_->getAsyncCopyStream(config_.device);
FAISS_ASSERT(
(size_t)pageSizeInVecs * this->d <=
(size_t)std::numeric_limits<int>::max());
float* bufPinnedA = (float*)pinnedAlloc.first;
float* bufPinnedB = bufPinnedA + (size_t)pageSizeInVecs * this->d;
float* bufPinned[2] = {bufPinnedA, bufPinnedB};
// Reserve space on the GPU for the destination of the pinned buffer
// copy
DeviceTensor<float, 2, true> bufGpuA(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true> bufGpuB(
resources_.get(),
makeTempAlloc(AllocType::Other, defaultStream),
{(int)pageSizeInVecs, (int)this->d});
DeviceTensor<float, 2, true>* bufGpus[2] = {&bufGpuA, &bufGpuB};
// Copy completion events for the pinned buffers
std::unique_ptr<CudaEvent> eventPinnedCopyDone[2];
// Execute completion events for the GPU buffers
std::unique_ptr<CudaEvent> eventGpuExecuteDone[2];
    // All offsets are in terms of number of vectors; they remain within
    // int bounds (this function only ever handles at most INT_MAX vectors)
// Current start offset for buffer 1
int cur1 = 0;
int cur1BufIndex = 0;
// Current start offset for buffer 2
int cur2 = -1;
int cur2BufIndex = 0;
// Current start offset for buffer 3
int cur3 = -1;
int cur3BufIndex = 0;
while (cur3 < n) {
// Start async pinned -> GPU copy first (buf 2)
if (cur2 != -1 && cur2 < n) {
// Copy pinned to GPU
int numToCopy = std::min(pageSizeInVecs, n - cur2);
// Make sure any previous execution has completed before continuing
auto& eventPrev = eventGpuExecuteDone[cur2BufIndex];
if (eventPrev.get()) {
eventPrev->streamWaitOnEvent(copyStream);
}
CUDA_VERIFY(cudaMemcpyAsync(
bufGpus[cur2BufIndex]->data(),
bufPinned[cur2BufIndex],
(size_t)numToCopy * this->d * sizeof(float),
cudaMemcpyHostToDevice,
copyStream));
// Mark a completion event in this stream
eventPinnedCopyDone[cur2BufIndex].reset(new CudaEvent(copyStream));
// We pick up from here
cur3 = cur2;
cur2 += numToCopy;
cur2BufIndex = (cur2BufIndex == 0) ? 1 : 0;
}
if (cur3 != -1 && cur3 < n) {
// Process on GPU
int numToProcess = std::min(pageSizeInVecs, n - cur3);
// Make sure the previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur3BufIndex];
FAISS_ASSERT(eventPrev.get());
eventPrev->streamWaitOnEvent(defaultStream);
            // Narrow the output tensors to the rows for this batch
auto outDistancesSlice =
outDistances.narrowOutermost(cur3, numToProcess);
auto outIndicesSlice =
outIndices.narrowOutermost(cur3, numToProcess);
searchImpl_(
numToProcess,
bufGpus[cur3BufIndex]->data(),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
// Create completion event
eventGpuExecuteDone[cur3BufIndex].reset(
new CudaEvent(defaultStream));
// We pick up from here
cur3BufIndex = (cur3BufIndex == 0) ? 1 : 0;
cur3 += numToProcess;
}
if (cur1 < n) {
// Copy CPU mem to CPU pinned
int numToCopy = std::min(pageSizeInVecs, n - cur1);
// Make sure any previous copy has completed before continuing
auto& eventPrev = eventPinnedCopyDone[cur1BufIndex];
if (eventPrev.get()) {
eventPrev->cpuWaitOnEvent();
}
memcpy(bufPinned[cur1BufIndex],
x + (size_t)cur1 * this->d,
(size_t)numToCopy * this->d * sizeof(float));
// We pick up from here
cur2 = cur1;
cur1 += numToCopy;
cur1BufIndex = (cur1BufIndex == 0) ? 1 : 0;
}
}
}
void GpuIndex::compute_residual(
const float* x,
float* residual,
Index::idx_t key) const {
FAISS_THROW_MSG("compute_residual not implemented for this type of index");
}
void GpuIndex::compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const {
FAISS_THROW_MSG(
"compute_residual_n not implemented for this type of index");
}
std::shared_ptr<GpuResources> GpuIndex::getResources() {
return resources_;
}
} // namespace gpu
} // namespace faiss
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <faiss/Index.h>
#include <faiss/gpu/GpuResources.h>
namespace faiss {
namespace gpu {
struct GpuIndexConfig {
inline GpuIndexConfig() : device(0), memorySpace(MemorySpace::Device) {}
/// GPU device on which the index is resident
int device;
/// What memory space to use for primary storage.
/// On Pascal and above (CC 6+) architectures, allows GPUs to use
/// more memory than is available on the GPU.
MemorySpace memorySpace;
};
class GpuIndex : public faiss::Index {
public:
GpuIndex(
std::shared_ptr<GpuResources> resources,
int dims,
faiss::MetricType metric,
float metricArg,
GpuIndexConfig config);
/// Returns the device that this index is resident on
int getDevice() const;
/// Returns a reference to our GpuResources object that manages memory,
/// stream and handle resources on the GPU
std::shared_ptr<GpuResources> getResources();
    /// Set the minimum data size for searches (in bytes) for which we use
    /// CPU -> GPU paging
void setMinPagingSize(size_t size);
/// Returns the current minimum data size for paged searches
size_t getMinPagingSize() const;
/// `x` can be resident on the CPU or any GPU; copies are performed
/// as needed
    /// Handles paged adds if the add set is too large; eventually calls addImpl_
    void add(Index::idx_t n, const float* x) override;
/// `x` and `ids` can be resident on the CPU or any GPU; copies are
/// performed as needed
    /// Handles paged adds if the add set is too large; eventually calls addImpl_
void add_with_ids(Index::idx_t n, const float* x, const Index::idx_t* ids)
override;
/// `x` and `labels` can be resident on the CPU or any GPU; copies are
/// performed as needed
void assign(
Index::idx_t n,
const float* x,
Index::idx_t* labels,
Index::idx_t k = 1) const override;
/// `x`, `distances` and `labels` can be resident on the CPU or any
/// GPU; copies are performed as needed
void search(
Index::idx_t n,
const float* x,
Index::idx_t k,
float* distances,
Index::idx_t* labels) const override;
/// Overridden to force GPU indices to provide their own GPU-friendly
/// implementation
void compute_residual(const float* x, float* residual, Index::idx_t key)
const override;
/// Overridden to force GPU indices to provide their own GPU-friendly
/// implementation
void compute_residual_n(
Index::idx_t n,
const float* xs,
float* residuals,
const Index::idx_t* keys) const override;
protected:
/// Copy what we need from the CPU equivalent
void copyFrom(const faiss::Index* index);
/// Copy what we have to the CPU equivalent
void copyTo(faiss::Index* index) const;
    /// Does addImpl_ require IDs? If so, and no IDs are provided, we will
    /// generate them sequentially based on the order in which the vectors are
    /// added
virtual bool addImplRequiresIDs_() const = 0;
/// Overridden to actually perform the add
/// All data is guaranteed to be resident on our device
virtual void addImpl_(int n, const float* x, const Index::idx_t* ids) = 0;
/// Overridden to actually perform the search
/// All data is guaranteed to be resident on our device
virtual void searchImpl_(
int n,
const float* x,
int k,
float* distances,
Index::idx_t* labels) const = 0;
private:
/// Handles paged adds if the add set is too large, passes to
/// addImpl_ to actually perform the add for the current page
void addPaged_(int n, const float* x, const Index::idx_t* ids);
/// Calls addImpl_ for a single page of GPU-resident data
void addPage_(int n, const float* x, const Index::idx_t* ids);
/// Calls searchImpl_ for a single page of GPU-resident data
void searchNonPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const;
/// Calls searchImpl_ for a single page of GPU-resident data,
/// handling paging of the data and copies from the CPU
void searchFromCpuPaged_(
int n,
const float* x,
int k,
float* outDistancesData,
Index::idx_t* outIndicesData) const;
protected:
/// Manages streams, cuBLAS handles and scratch memory for devices
std::shared_ptr<GpuResources> resources_;
/// Our configuration options
const GpuIndexConfig config_;
/// Size above which we page copies from the CPU to GPU
size_t minPagedSize_;
};
} // namespace gpu
} // namespace faiss
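// A minimal sketch of the contract a concrete subclass has to satisfy
// (the class name below is hypothetical; constructor, training and
// copy logic are omitted):
//
//   class MyGpuIndex : public faiss::gpu::GpuIndex {
//    protected:
//     // addImpl_ will be called with sequentially generated IDs if none
//     // are supplied by the caller
//     bool addImplRequiresIDs_() const override { return true; }
//
//     // x and ids are guaranteed to be resident on config_.device
//     void addImpl_(int n, const float* x, const Index::idx_t* ids) override;
//
//     // x, distances and labels are guaranteed to be device-resident;
//     // must write k results per query
//     void searchImpl_(int n, const float* x, int k,
//                      float* distances, Index::idx_t* labels) const override;
//   };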
/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <faiss/gpu/GpuIndexBinaryFlat.h>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/impl/BinaryFlatIndex.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/CopyUtils.cuh>
namespace faiss {
namespace gpu {
/// Default CPU search size for which we use paged copies
constexpr size_t kMinPageSize = (size_t)256 * 1024 * 1024;
GpuIndexBinaryFlat::GpuIndexBinaryFlat(
GpuResourcesProvider* provider,
const faiss::IndexBinaryFlat* index,
GpuIndexBinaryFlatConfig config)
: IndexBinary(index->d),
resources_(provider->getResources()),
binaryFlatConfig_(config) {
FAISS_THROW_IF_NOT_FMT(
this->d % 8 == 0,
"vector dimension (number of bits) "
"must be divisible by 8 (passed %d)",
this->d);
// Flat index doesn't need training
this->is_trained = true;
copyFrom(index);
}
GpuIndexBinaryFlat::GpuIndexBinaryFlat(
GpuResourcesProvider* provider,
int dims,
GpuIndexBinaryFlatConfig config)
: IndexBinary(dims),
resources_(provider->getResources()),
binaryFlatConfig_(std::move(config)) {
FAISS_THROW_IF_NOT_FMT(
this->d % 8 == 0,
"vector dimension (number of bits) "
"must be divisible by 8 (passed %d)",
this->d);
// Flat index doesn't need training
this->is_trained = true;
// Construct index
DeviceScope scope(binaryFlatConfig_.device);
data_.reset(new BinaryFlatIndex(
resources_.get(), this->d, binaryFlatConfig_.memorySpace));
}
GpuIndexBinaryFlat::~GpuIndexBinaryFlat() {}
int GpuIndexBinaryFlat::getDevice() const {
return binaryFlatConfig_.device;
}
std::shared_ptr<GpuResources> GpuIndexBinaryFlat::getResources() {
return resources_;
}
void GpuIndexBinaryFlat::copyFrom(const faiss::IndexBinaryFlat* index) {
DeviceScope scope(binaryFlatConfig_.device);
this->d = index->d;
// GPU code has 32 bit indices
FAISS_THROW_IF_NOT_FMT(
index->ntotal <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices; "
"attempting to copy CPU index with %zu parameters",
(size_t)std::numeric_limits<int>::max(),
(size_t)index->ntotal);
this->ntotal = index->ntotal;
// destroy old first before allocating new
data_.reset();
data_.reset(new BinaryFlatIndex(
resources_.get(), this->d, binaryFlatConfig_.memorySpace));
// The index could be empty
if (index->ntotal > 0) {
data_->add(
index->xb.data(),
index->ntotal,
resources_->getDefaultStream(binaryFlatConfig_.device));
}
}
void GpuIndexBinaryFlat::copyTo(faiss::IndexBinaryFlat* index) const {
DeviceScope scope(binaryFlatConfig_.device);
index->d = this->d;
index->ntotal = this->ntotal;
FAISS_ASSERT(data_);
FAISS_ASSERT(data_->getSize() == this->ntotal);
index->xb.resize(this->ntotal * (this->d / 8));
if (this->ntotal > 0) {
fromDevice(
data_->getVectorsRef(),
index->xb.data(),
resources_->getDefaultStream(binaryFlatConfig_.device));
}
}
void GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, const uint8_t* x) {
DeviceScope scope(binaryFlatConfig_.device);
// To avoid multiple re-allocations, ensure we have enough storage
// available
data_->reserve(n, resources_->getDefaultStream(binaryFlatConfig_.device));
// Due to GPU indexing in int32, we can't store more than this
// number of vectors on a GPU
FAISS_THROW_IF_NOT_FMT(
this->ntotal + n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
data_->add(
(const unsigned char*)x,
n,
resources_->getDefaultStream(binaryFlatConfig_.device));
this->ntotal += n;
}
void GpuIndexBinaryFlat::reset() {
DeviceScope scope(binaryFlatConfig_.device);
// Free the underlying memory
data_->reset();
this->ntotal = 0;
}
void GpuIndexBinaryFlat::search(
faiss::IndexBinary::idx_t n,
const uint8_t* x,
faiss::IndexBinary::idx_t k,
int32_t* distances,
faiss::IndexBinary::idx_t* labels) const {
if (n == 0) {
return;
}
FAISS_THROW_IF_NOT(k > 0);
// For now, only support <= max int results
FAISS_THROW_IF_NOT_FMT(
n <= (Index::idx_t)std::numeric_limits<int>::max(),
"GPU index only supports up to %zu indices",
(size_t)std::numeric_limits<int>::max());
FAISS_THROW_IF_NOT_FMT(
k <= (Index::idx_t)getMaxKSelection(),
"GPU only supports k <= %d (requested %d)",
getMaxKSelection(),
(int)k); // select limitation
DeviceScope scope(binaryFlatConfig_.device);
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
// The input vectors may be too large for the GPU, but we still
// assume that the output distances and labels are not.
// Go ahead and make space for output distances and labels on the
// GPU.
// If we reach a point where all inputs are too big, we can add
// another level of tiling.
auto outDistances = toDeviceTemporary<int32_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
distances,
stream,
{(int)n, (int)k});
// FlatIndex only supports an interface returning int indices
DeviceTensor<int, 2, true> outIntIndices(
resources_.get(),
makeTempAlloc(AllocType::Other, stream),
{(int)n, (int)k});
bool usePaged = false;
if (getDeviceForAddress(x) == -1) {
// It is possible that the user is querying for a vector set size
// `x` that won't fit on the GPU.
// In this case, we will have to handle paging of the data from CPU
// -> GPU.
// Currently, we don't handle the case where the output data won't
// fit on the GPU (e.g., n * k is too large for the GPU memory).
size_t dataSize = (size_t)n * (this->d / 8) * sizeof(uint8_t);
if (dataSize >= kMinPageSize) {
searchFromCpuPaged_(
n, x, k, outDistances.data(), outIntIndices.data());
usePaged = true;
}
}
if (!usePaged) {
searchNonPaged_(n, x, k, outDistances.data(), outIntIndices.data());
}
// Convert and copy int indices out
auto outIndices = toDeviceTemporary<Index::idx_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
labels,
stream,
{(int)n, (int)k});
// Convert int to idx_t
convertTensor<int, Index::idx_t, 2>(stream, outIntIndices, outIndices);
// Copy back if necessary
fromDevice<int32_t, 2>(outDistances, distances, stream);
fromDevice<Index::idx_t, 2>(outIndices, labels, stream);
}
void GpuIndexBinaryFlat::searchNonPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const {
Tensor<int32_t, 2, true> outDistances(outDistancesData, {n, k});
Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
// Make sure arguments are on the device we desire; use temporary
// memory allocations to move it if necessary
auto vecs = toDeviceTemporary<uint8_t, 2>(
resources_.get(),
binaryFlatConfig_.device,
const_cast<uint8_t*>(x),
stream,
{n, (int)(this->d / 8)});
data_->query(vecs, k, outDistances, outIndices);
}
void GpuIndexBinaryFlat::searchFromCpuPaged_(
int n,
const uint8_t* x,
int k,
int32_t* outDistancesData,
int* outIndicesData) const {
Tensor<int32_t, 2, true> outDistances(outDistancesData, {n, k});
Tensor<int, 2, true> outIndices(outIndicesData, {n, k});
auto vectorSize = sizeof(uint8_t) * (this->d / 8);
// Just page without overlapping copy with compute (as GpuIndexFlat does)
int batchSize = utils::nextHighestPowerOf2(
(int)((size_t)kMinPageSize / vectorSize));
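    // For illustration (assumed d, not from this file): with d = 96 bits
    // (12 bytes per vector), kMinPageSize / 12 = 22369621, which rounds up
    // to the next power of two, 2^25 = 33554432 vectors per batch.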
for (int cur = 0; cur < n; cur += batchSize) {
int num = std::min(batchSize, n - cur);
auto outDistancesSlice = outDistances.narrowOutermost(cur, num);
auto outIndicesSlice = outIndices.narrowOutermost(cur, num);
searchNonPaged_(
num,
x + (size_t)cur * (this->d / 8),
k,
outDistancesSlice.data(),
outIndicesSlice.data());
}
}
void GpuIndexBinaryFlat::reconstruct(
faiss::IndexBinary::idx_t key,
uint8_t* out) const {
DeviceScope scope(binaryFlatConfig_.device);
FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds");
auto stream = resources_->getDefaultStream(binaryFlatConfig_.device);
auto& vecs = data_->getVectorsRef();
auto vec = vecs[key];
fromDevice(vec.data(), out, vecs.getSize(1), stream);
}
} // namespace gpu
} // namespace faiss
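// A minimal usage sketch for GpuIndexBinaryFlat. StandardGpuResources and the
// GpuIndexBinaryFlatConfig members come from elsewhere in faiss/gpu; the
// sizes below are illustrative only.
//
//   faiss::gpu::StandardGpuResources res;
//   faiss::gpu::GpuIndexBinaryFlatConfig config;
//   config.device = 0;
//   faiss::gpu::GpuIndexBinaryFlat index(&res, 256, config);  // d = 256 bits
//   index.add(nb, codes);               // codes: nb * (256 / 8) uint8_t bytes
//   // distances are Hamming distances (int32_t), labels are Index::idx_t
//   index.search(nq, queries, 4, distances, labels);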