diff --git a/demos/CMakeLists.txt b/demos/CMakeLists.txt new file mode 100644 index 0000000..6b4141d --- /dev/null +++ b/demos/CMakeLists.txt @@ -0,0 +1,108 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +add_executable(demo_imi_flat EXCLUDE_FROM_ALL demo_imi_flat.cpp) +target_link_libraries(demo_imi_flat PRIVATE faiss) + +add_executable(demo_imi_pq EXCLUDE_FROM_ALL demo_imi_pq.cpp) +target_link_libraries(demo_imi_pq PRIVATE faiss) + +add_executable(demo_ivfpq_indexing EXCLUDE_FROM_ALL demo_ivfpq_indexing.cpp) +target_link_libraries(demo_ivfpq_indexing PRIVATE faiss) + +add_executable(demo_nndescent EXCLUDE_FROM_ALL demo_nndescent.cpp) +target_link_libraries(demo_nndescent PRIVATE faiss) + +add_executable(demo_sift1M EXCLUDE_FROM_ALL demo_sift1M.cpp) +target_link_libraries(demo_sift1M PRIVATE faiss) + +add_executable(demo_weighted_kmeans EXCLUDE_FROM_ALL demo_weighted_kmeans.cpp) +target_link_libraries(demo_weighted_kmeans PRIVATE faiss) + +add_executable(demo_residual_quantizer EXCLUDE_FROM_ALL demo_residual_quantizer.cpp) +target_link_libraries(demo_residual_quantizer PRIVATE faiss) + +# add_executable(demo_new_test EXCLUDE_FROM_ALL demo_new_test.cpp) +# target_link_libraries(demo_new_test PRIVATE faiss) + +# add_executable(demo_test_search EXCLUDE_FROM_ALL demo_test_search.cpp) +# target_link_libraries(demo_test_search PRIVATE faiss) + +# add_executable(demo_test_search_small EXCLUDE_FROM_ALL demo_test_search_small.cpp) +# target_link_libraries(demo_test_search_small PRIVATE faiss) + +# add_executable(demo_test_hybrid_small EXCLUDE_FROM_ALL demo_test_hybrid_small.cpp) +# target_link_libraries(demo_test_hybrid_small PRIVATE faiss) + +# add_executable(demo_test_hybrid_large EXCLUDE_FROM_ALL demo_test_hybrid_large.cpp) +# target_link_libraries(demo_test_hybrid_large PRIVATE faiss) + +# add_executable(make_indices EXCLUDE_FROM_ALL make_indices.cpp) +# target_link_libraries(make_indices PRIVATE faiss) + +# add_executable(make_sift_indices EXCLUDE_FROM_ALL make_sift_indices.cpp) +# target_link_libraries(make_sift_indices PRIVATE faiss) + +# add_executable(benchmark EXCLUDE_FROM_ALL benchmark.cpp) +# target_link_libraries(benchmark PRIVATE faiss) + +add_executable(utils EXCLUDE_FROM_ALL utils.cpp) +target_link_libraries(utils PRIVATE faiss) + +# add_executable(correlation EXCLUDE_FROM_ALL correlation.cpp) +# target_link_libraries(correlation PRIVATE faiss) + + +# add_executable(check_queries EXCLUDE_FROM_ALL check_queries.cpp) +# target_link_libraries(check_queries PRIVATE faiss) + +# add_executable(trace_query EXCLUDE_FROM_ALL trace_query.cpp) +# target_link_libraries(trace_query PRIVATE faiss) + +# add_executable(make_debug_index EXCLUDE_FROM_ALL make_debug_index.cpp) +# target_link_libraries(make_debug_index PRIVATE faiss) + +# add_executable(print_edges EXCLUDE_FROM_ALL print_edges.cpp) +# target_link_libraries(print_edges PRIVATE faiss) + +# add_executable(profile_query EXCLUDE_FROM_ALL profile_query.cpp) +# target_link_libraries(profile_query PRIVATE faiss) + +# add_executable(prefilter EXCLUDE_FROM_ALL prefilter.cpp) +# target_link_libraries(prefilter PRIVATE faiss) + +# add_executable(make_tripclick_indices EXCLUDE_FROM_ALL make_tripclick_indices.cpp) +# target_link_libraries(make_tripclick_indices PRIVATE faiss) + +# add_executable(test_tripclick_indices EXCLUDE_FROM_ALL test_tripclick_indices.cpp) +# 
target_link_libraries(test_tripclick_indices PRIVATE faiss)
+
+
+# add_executable(trace_tripclick_query EXCLUDE_FROM_ALL trace_tripclick_query.cpp)
+# target_link_libraries(trace_tripclick_query PRIVATE faiss)
+
+# add_executable(test_tripclick_dates EXCLUDE_FROM_ALL test_tripclick_dates.cpp)
+# target_link_libraries(test_tripclick_dates PRIVATE faiss)
+
+# add_executable(make_tripclick_dates_indices EXCLUDE_FROM_ALL make_tripclick_dates_indices.cpp)
+# target_link_libraries(make_tripclick_dates_indices PRIVATE faiss)
+
+
+# add_executable(make_tripclick_oracle_indices EXCLUDE_FROM_ALL make_tripclick_oracle_indices.cpp)
+# target_link_libraries(make_tripclick_oracle_indices PRIVATE faiss)
+
+
+# add_executable(make_laion_indices EXCLUDE_FROM_ALL make_laion_indices.cpp)
+# target_link_libraries(make_laion_indices PRIVATE faiss)
+
+# add_executable(test_laion_indices EXCLUDE_FROM_ALL test_laion_indices.cpp)
+# target_link_libraries(test_laion_indices PRIVATE faiss)
+
+# add_executable(test_laion_arb_pred EXCLUDE_FROM_ALL test_laion_arb_pred.cpp)
+# target_link_libraries(test_laion_arb_pred PRIVATE faiss)
+
+add_executable(test_acorn EXCLUDE_FROM_ALL test_acorn.cpp)
+target_link_libraries(test_acorn PRIVATE faiss)
diff --git a/demos/README.md b/demos/README.md
new file mode 100644
index 0000000..71a23f2
--- /dev/null
+++ b/demos/README.md
@@ -0,0 +1,28 @@
+
+
+Demos for a few Faiss functionalities
+=====================================
+
+
+demo_auto_tune.py
+-----------------
+
+Demonstrates the auto-tuning functionality of Faiss.
+
+
+demo_ondisk_ivf.py
+------------------
+
+Shows how to construct a Faiss index that stores the inverted file
+data on disk, e.g. when it does not fit in RAM. The script works on a
+small dataset (sift1M) for demonstration and proceeds in stages:
+
+0: train on the dataset
+
+1-4: build 4 indexes, each containing 1/4 of the dataset. This can be
+done in parallel on several machines
+
+5: merge the 4 indexes into one that is written directly to disk
+(it need not fit in RAM)
+
+6: load and test the index
diff --git a/demos/demo_auto_tune.py b/demos/demo_auto_tune.py
new file mode 100755
index 0000000..be1079a
--- /dev/null
+++ b/demos/demo_auto_tune.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python2
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
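+
+# Note: like demo_sift1M, this script expects the ANN_SIFT1M dataset
+# (http://corpus-texmex.irisa.fr/) unpacked into a sift1M/ subdirectory
+# containing sift_learn.fvecs, sift_base.fvecs, sift_query.fvecs and
+# sift_groundtruth.ivecs.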
+ +from __future__ import print_function +import os +import time +import numpy as np + +try: + import matplotlib + matplotlib.use('Agg') + from matplotlib import pyplot + graphical_output = True +except ImportError: + graphical_output = False + +import faiss + +################################################################# +# Small I/O functions +################################################################# + +def ivecs_read(fname): + a = np.fromfile(fname, dtype="int32") + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +def plot_OperatingPoints(ops, nq, **kwargs): + ops = ops.optimal_pts + n = ops.size() * 2 - 1 + pyplot.plot([ops.at( i // 2).perf for i in range(n)], + [ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)], + **kwargs) + + +################################################################# +# prepare common data for all indexes +################################################################# + + + +t0 = time.time() + +print("load data") + +xt = fvecs_read("sift1M/sift_learn.fvecs") +xb = fvecs_read("sift1M/sift_base.fvecs") +xq = fvecs_read("sift1M/sift_query.fvecs") + +d = xt.shape[1] + +print("load GT") + +gt = ivecs_read("sift1M/sift_groundtruth.ivecs") +gt = gt.astype('int64') +k = gt.shape[1] + +print("prepare criterion") + +# criterion = 1-recall at 1 +crit = faiss.OneRecallAtRCriterion(xq.shape[0], 1) +crit.set_groundtruth(None, gt) +crit.nnn = k + +# indexes that are useful when there is no limitation on memory usage +unlimited_mem_keys = [ + "IMI2x10,Flat", "IMI2x11,Flat", + "IVF4096,Flat", "IVF16384,Flat", + "PCA64,IMI2x10,Flat"] + +# memory limited to 16 bytes / vector +keys_mem_16 = [ + 'IMI2x10,PQ16', 'IVF4096,PQ16', + 'IMI2x10,PQ8+8', 'OPQ16_64,IMI2x10,PQ16' + ] + +# limited to 32 bytes / vector +keys_mem_32 = [ + 'IMI2x10,PQ32', 'IVF4096,PQ32', 'IVF16384,PQ32', + 'IMI2x10,PQ16+16', + 'OPQ32,IVF4096,PQ32', 'IVF4096,PQ16+16', 'OPQ16,IMI2x10,PQ16+16' + ] + +# indexes that can run on the GPU +keys_gpu = [ + "PCA64,IVF4096,Flat", + "PCA64,Flat", "Flat", "IVF4096,Flat", "IVF16384,Flat", + "IVF4096,PQ32"] + + +keys_to_test = unlimited_mem_keys +use_gpu = False + + +if use_gpu: + # if this fails, it means that the GPU version was not comp + assert faiss.StandardGpuResources, \ + "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed" + res = faiss.StandardGpuResources() + dev_no = 0 + +# remember results from other index types +op_per_key = [] + + +# keep track of optimal operating points seen so far +op = faiss.OperatingPoints() + + +for index_key in keys_to_test: + + print("============ key", index_key) + + # make the index described by the key + index = faiss.index_factory(d, index_key) + + + if use_gpu: + # transfer to GPU (may be partial) + index = faiss.index_cpu_to_gpu(res, dev_no, index) + params = faiss.GpuParameterSpace() + else: + params = faiss.ParameterSpace() + + params.initialize(index) + + print("[%.3f s] train & add" % (time.time() - t0)) + + index.train(xt) + index.add(xb) + + print("[%.3f s] explore op points" % (time.time() - t0)) + + # find operating points for this index + opi = params.explore(index, xq, crit) + + print("[%.3f s] result operating points:" % (time.time() - t0)) + opi.display() + + # update best operating points so far + op.merge_with(opi, index_key + " ") + + op_per_key.append((index_key, opi)) + + if graphical_output: + # graphical output (to tmp/ subdirectory) + + fig = pyplot.figure(figsize=(12, 9)) + 
pyplot.xlabel("1-recall at 1") + pyplot.ylabel("search time (ms/query, %d threads)" % faiss.omp_get_max_threads()) + pyplot.gca().set_yscale('log') + pyplot.grid() + for i2, opi2 in op_per_key: + plot_OperatingPoints(opi2, crit.nq, label = i2, marker = 'o') + # plot_OperatingPoints(op, crit.nq, label = 'best', marker = 'o', color = 'r') + pyplot.legend(loc=2) + fig.savefig('tmp/demo_auto_tune.png') + + +print("[%.3f s] final result:" % (time.time() - t0)) + +op.display() diff --git a/demos/demo_client_server_ivf.py b/demos/demo_client_server_ivf.py new file mode 100755 index 0000000..82803d8 --- /dev/null +++ b/demos/demo_client_server_ivf.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys +import numpy as np +import faiss + +from faiss.contrib.client_server import run_index_server, ClientIndex + + +################################################################# +# Small I/O functions +################################################################# + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype='int32') + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view('float32') + + +################################################################# +# Main program +################################################################# + +stage = int(sys.argv[1]) + +tmpdir = '/tmp/' + +if stage == 0: + # train the index + xt = fvecs_read("sift1M/sift_learn.fvecs") + index = faiss.index_factory(xt.shape[1], "IVF4096,Flat") + print("training index") + index.train(xt) + print("write " + tmpdir + "trained.index") + faiss.write_index(index, tmpdir + "trained.index") + + +if 1 <= stage <= 4: + # add 1/4 of the database to 4 independent indexes + bno = stage - 1 + xb = fvecs_read("sift1M/sift_base.fvecs") + i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4) + index = faiss.read_index(tmpdir + "trained.index") + print("adding vectors %d:%d" % (i0, i1)) + index.add_with_ids(xb[i0:i1], np.arange(i0, i1)) + print("write " + tmpdir + "block_%d.index" % bno) + faiss.write_index(index, tmpdir + "block_%d.index" % bno) + + +machine_ports = [ + ('localhost', 12010), + ('localhost', 12011), + ('localhost', 12012), + ('localhost', 12013), +] +v6 = False + +if 5 <= stage <= 8: + # load an index slice and launch index + bno = stage - 5 + + fname = tmpdir + "block_%d.index" % bno + print("read " + fname) + index = faiss.read_index(fname) + + port = machine_ports[bno][1] + run_index_server(index, port, v6=v6) + + +if stage == 9: + client_index = ClientIndex(machine_ports) + print('index size:', client_index.ntotal) + client_index.set_nprobe(16) + + # load query vectors and ground-truth + xq = fvecs_read("sift1M/sift_query.fvecs") + gt = ivecs_read("sift1M/sift_groundtruth.ivecs") + + D, I = client_index.search(xq, 5) + + recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0]) + print("recall@1: %.3f" % recall_at_1) diff --git a/demos/demo_imi_flat.cpp b/demos/demo_imi_flat.cpp new file mode 100644 index 0000000..7713da4 --- /dev/null +++ b/demos/demo_imi_flat.cpp @@ -0,0 +1,156 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <sys/time.h>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexIVFFlat.h>
+#include <faiss/MetricType.h>
+#include <faiss/MultiIndexQuantizer.h>
+
+double elapsed() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+}
+
+int main() {
+    double t0 = elapsed();
+
+    // dimension of the vectors to index
+    int d = 128;
+
+    // size of the database we plan to index
+    size_t nb = 1000 * 1000;
+
+    // make a set of nt training vectors in the unit cube
+    // (could be the database)
+    size_t nt = 100 * 1000;
+
+    //---------------------------------------------------------------
+    // Define the core quantizer
+    // We choose a multiple inverted index for faster training with less data
+    // and because it usually offers the best accuracy/speed trade-offs
+    //
+    // We assume here that the lifespan of this coarse quantizer will cover
+    // the lifespan of the inverted-file index IndexIVFFlat below
+    // With dynamic allocation, one may give the responsibility to free the
+    // quantizer to the inverted-file index (with attribute do_delete_quantizer)
+    //
+    // Note: a regular clustering algorithm would be defined as:
+    //       faiss::IndexFlatL2 coarse_quantizer (d);
+    //
+    // Use nhash=2 subquantizers to define the product coarse quantizer
+    // Number of bits: we will have 2^nbits_subq centroids per subquantizer
+    //                 meaning (2^nbits_subq)^nhash distinct inverted lists
+    size_t nhash = 2;
+    size_t nbits_subq = int(log2(nb + 1) / 2); // good choice in general
+    size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
+
+    faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
+
+    printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)\n",
+           nhash,
+           nbits_subq,
+           ncentroids,
+           nb);
+
+    // the coarse quantizer should not be deallocated before the index
+    faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
+    faiss::IndexIVFFlat index(&coarse_quantizer, d, ncentroids, metric);
+    index.quantizer_trains_alone = true;
+
+    // define the number of probes. 2048 is for high-dim, overkill in practice
+    // Use 4-1024 depending on the speed/accuracy trade-off that you want
+    index.nprobe = 2048;
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    { // training
+        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
+               elapsed() - t0,
+               nt,
+               d);
+
+        std::vector<float> trainvecs(nt * d);
+        for (size_t i = 0; i < nt * d; i++) {
+            trainvecs[i] = distrib(rng);
+        }
+
+        printf("[%.3f s] Training the index\n", elapsed() - t0);
+        index.verbose = true;
+        index.train(nt, trainvecs.data());
+    }
+
+    size_t nq;
+    std::vector<float> queries;
+
+    { // populating the database
+        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
+               elapsed() - t0,
+               nb);
+
+        std::vector<float> database(nb * d);
+        for (size_t i = 0; i < nb * d; i++) {
+            database[i] = distrib(rng);
+        }
+
+        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
+
+        index.add(nb, database.data());
+
+        // remember a few elements from the database as queries
+        int i0 = 1234;
+        int i1 = 1244;
+
+        nq = i1 - i0;
+        queries.resize(nq * d);
+        for (int i = i0; i < i1; i++) {
+            for (int j = 0; j < d; j++) {
+                queries[(i - i0) * d + j] = database[i * d + j];
+            }
+        }
+    }
+
+    { // searching the database
+        int k = 5;
+        printf("[%.3f s] Searching the %d nearest neighbors "
+               "of %ld vectors in the index\n",
+               elapsed() - t0,
+               k,
+               nq);
+
+        std::vector<faiss::idx_t> nns(k * nq);
+        std::vector<float> dis(k * nq);
+
+        index.search(nq, queries.data(), k, dis.data(), nns.data());
+
+        printf("[%.3f s] Query results (vector ids, then distances):\n",
+               elapsed() - t0);
+
+        for (int i = 0; i < nq; i++) {
+            printf("query %2d: ", i);
+            for (int j = 0; j < k; j++) {
+                printf("%7ld ", nns[j + i * k]);
+            }
+            printf("\n     dis: ");
+            for (int j = 0; j < k; j++) {
+                printf("%7g ", dis[j + i * k]);
+            }
+            printf("\n");
+        }
+    }
+    return 0;
+}
diff --git a/demos/demo_imi_pq.cpp b/demos/demo_imi_pq.cpp
new file mode 100644
index 0000000..b20aefb
--- /dev/null
+++ b/demos/demo_imi_pq.cpp
@@ -0,0 +1,208 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
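+ *
+ * Same pipeline as demo_imi_flat, but the inverted lists store 16-byte
+ * PQ codes (IndexIVFPQ); the demo also adds vectors in batches with
+ * explicit ids and serializes the index to /tmp at two stages.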
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <sys/time.h>
+
+#include <faiss/IndexIVFPQ.h>
+#include <faiss/MetricType.h>
+#include <faiss/MultiIndexQuantizer.h>
+#include <faiss/index_io.h>
+
+double elapsed() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+}
+
+int main() {
+    double t0 = elapsed();
+
+    // dimension of the vectors to index
+    int d = 64;
+
+    // size of the database we plan to index
+    size_t nb = 1000 * 1000;
+    size_t add_bs = 10000; // size of the blocks to add
+
+    // make a set of nt training vectors in the unit cube
+    // (could be the database)
+    size_t nt = 100 * 1000;
+
+    //---------------------------------------------------------------
+    // Define the core quantizer
+    // We choose a multiple inverted index for faster training with less data
+    // and because it usually offers the best accuracy/speed trade-offs
+    //
+    // We assume here that the lifespan of this coarse quantizer will cover
+    // the lifespan of the inverted-file index IndexIVFPQ below
+    // With dynamic allocation, one may give the responsibility to free the
+    // quantizer to the inverted-file index (with attribute do_delete_quantizer)
+    //
+    // Note: a regular clustering algorithm would be defined as:
+    //       faiss::IndexFlatL2 coarse_quantizer (d);
+    //
+    // Use nhash=2 subquantizers to define the product coarse quantizer
+    // Number of bits: we will have 2^nbits_subq centroids per subquantizer
+    //                 meaning (2^nbits_subq)^nhash distinct inverted lists
+    //
+    // The parameter bytes_per_code is determined by the memory
+    // constraint, the dataset will use nb * (bytes_per_code + 8)
+    // bytes.
+    //
+    // The parameter nbits_subq is determined by the size of the dataset to
+    // index.
+    //
+    size_t nhash = 2;
+    size_t nbits_subq = 9;
+    size_t ncentroids = 1 << (nhash * nbits_subq); // total # of centroids
+    int bytes_per_code = 16;
+
+    faiss::MultiIndexQuantizer coarse_quantizer(d, nhash, nbits_subq);
+
+    printf("IMI (%ld,%ld): %ld virtual centroids (target: %ld base vectors)\n",
+           nhash,
+           nbits_subq,
+           ncentroids,
+           nb);
+
+    // the coarse quantizer should not be deallocated before the index
+    // 8 = nb of bits per sub-code (almost always 8)
+    faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT
+    faiss::IndexIVFPQ index(
+            &coarse_quantizer, d, ncentroids, bytes_per_code, 8);
+    index.quantizer_trains_alone = true;
+
+    // define the number of probes. 2048 is for high-dim, overkill in practice
+    // Use 4-1024 depending on the speed/accuracy trade-off that you want
+    index.nprobe = 2048;
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    { // training.
+
+        // The distribution of the training vectors should be the same
+        // as the database vectors. It could be a sub-sample of the
+        // database vectors, if sampling is not biased. Here we just
+        // randomly generate the vectors.
+
+        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
+               elapsed() - t0,
+               nt,
+               d);
+
+        std::vector<float> trainvecs(nt * d);
+        for (size_t i = 0; i < nt; i++) {
+            for (size_t j = 0; j < d; j++) {
+                trainvecs[i * d + j] = distrib(rng);
+            }
+        }
+
+        printf("[%.3f s] Training the index\n", elapsed() - t0);
+        index.verbose = true;
+        index.train(nt, trainvecs.data());
+    }
+
+    // the index can be re-loaded later with
+    // faiss::Index* idx = faiss::read_index("/tmp/trained_index.faissindex");
+    faiss::write_index(&index, "/tmp/trained_index.faissindex");
+
+    size_t nq;
+    std::vector<float> queries;
+
+    { // populating the database
+        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
+               elapsed() - t0,
+               nb);
+
+        std::vector<float> database(nb * d);
+        std::vector<faiss::idx_t> ids(nb);
+        for (size_t i = 0; i < nb; i++) {
+            for (size_t j = 0; j < d; j++) {
+                database[i * d + j] = distrib(rng);
+            }
+            ids[i] = 8760000000L + i;
+        }
+
+        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
+
+        for (size_t begin = 0; begin < nb; begin += add_bs) {
+            size_t end = std::min(begin + add_bs, nb);
+            index.add_with_ids(
+                    end - begin,
+                    database.data() + d * begin,
+                    ids.data() + begin);
+        }
+
+        // remember a few elements from the database as queries
+        int i0 = 1234;
+        int i1 = 1244;
+
+        nq = i1 - i0;
+        queries.resize(nq * d);
+        for (int i = i0; i < i1; i++) {
+            for (int j = 0; j < d; j++) {
+                queries[(i - i0) * d + j] = database[i * d + j];
+            }
+        }
+    }
+
+    // A few notes on the internal format of the index:
+    //
+    // - the posting lists for PQ codes are index.codes, which is a
+    //   std::vector<std::vector<uint8_t>>
+    //   if n is the length of posting list #i, codes[i] has length
+    //   bytes_per_code * n
+    //
+    // - the corresponding ids are stored in index.ids
+    //
+    // - given a vector float *x, finding which k centroids are
+    //   closest to it (ie to find the nearest neighbors) can be done with
+    //
+    //   faiss::idx_t *centroid_ids = new faiss::idx_t[k];
+    //   float *distances = new float[k];
+    //   index.quantizer->search(1, x, k, distances, centroid_ids);
+
+    faiss::write_index(&index, "/tmp/populated_index.faissindex");
+
+    { // searching the database
+        int k = 5;
+        printf("[%.3f s] Searching the %d nearest neighbors "
+               "of %ld vectors in the index\n",
+               elapsed() - t0,
+               k,
+               nq);
+
+        std::vector<faiss::idx_t> nns(k * nq);
+        std::vector<float> dis(k * nq);
+
+        index.search(nq, queries.data(), k, dis.data(), nns.data());
+
+        printf("[%.3f s] Query results (vector ids, then distances):\n",
+               elapsed() - t0);
+
+        for (int i = 0; i < nq; i++) {
+            printf("query %2d: ", i);
+            for (int j = 0; j < k; j++) {
+                printf("%7ld ", nns[j + i * k]);
+            }
+            printf("\n     dis: ");
+            for (int j = 0; j < k; j++) {
+                printf("%7g ", dis[j + i * k]);
+            }
+            printf("\n");
+        }
+    }
+    return 0;
+}
diff --git a/demos/demo_ivfpq_indexing.cpp b/demos/demo_ivfpq_indexing.cpp
new file mode 100644
index 0000000..7f3efbd
--- /dev/null
+++ b/demos/demo_ivfpq_indexing.cpp
@@ -0,0 +1,139 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <sys/time.h>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexIVFPQ.h>
+#include <faiss/index_io.h>
+
+double elapsed() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+}
+
+int main() {
+    double t0 = elapsed();
+
+    // dimension of the vectors to index
+    int d = 128;
+
+    // size of the database we plan to index
+    size_t nb = 200 * 1000;
+
+    // make a set of nt training vectors in the unit cube
+    // (could be the database)
+    size_t nt = 100 * 1000;
+
+    // make the index object and train it
+    faiss::IndexFlatL2 coarse_quantizer(d);
+
+    // a reasonable number of centroids to index nb vectors
+    int ncentroids = int(4 * sqrt(nb));
+
+    // the coarse quantizer should not be deallocated before the index
+    // 4 = nb of bytes per code (d must be a multiple of this)
+    // 8 = nb of bits per sub-code (almost always 8)
+    faiss::IndexIVFPQ index(&coarse_quantizer, d, ncentroids, 4, 8);
+
+    std::mt19937 rng;
+
+    { // training
+        printf("[%.3f s] Generating %ld vectors in %dD for training\n",
+               elapsed() - t0,
+               nt,
+               d);
+
+        std::vector<float> trainvecs(nt * d);
+        std::uniform_real_distribution<> distrib;
+        for (size_t i = 0; i < nt * d; i++) {
+            trainvecs[i] = distrib(rng);
+        }
+
+        printf("[%.3f s] Training the index\n", elapsed() - t0);
+        index.verbose = true;
+
+        index.train(nt, trainvecs.data());
+    }
+
+    { // I/O demo
+        const char* outfilename = "/tmp/index_trained.faissindex";
+        printf("[%.3f s] storing the pre-trained index to %s\n",
+               elapsed() - t0,
+               outfilename);
+
+        write_index(&index, outfilename);
+    }
+
+    size_t nq;
+    std::vector<float> queries;
+
+    { // populating the database
+        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
+               elapsed() - t0,
+               nb);
+
+        std::vector<float> database(nb * d);
+        std::uniform_real_distribution<> distrib;
+        for (size_t i = 0; i < nb * d; i++) {
+            database[i] = distrib(rng);
+        }
+
+        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
+
+        index.add(nb, database.data());
+
+        printf("[%.3f s] imbalance factor: %g\n",
+               elapsed() - t0,
+               index.invlists->imbalance_factor());
+
+        // remember a few elements from the database as queries
+        int i0 = 1234;
+        int i1 = 1243;
+
+        nq = i1 - i0;
+        queries.resize(nq * d);
+        for (int i = i0; i < i1; i++) {
+            for (int j = 0; j < d; j++) {
+                queries[(i - i0) * d + j] = database[i * d + j];
+            }
+        }
+    }
+
+    { // searching the database
+        int k = 5;
+        printf("[%.3f s] Searching the %d nearest neighbors "
+               "of %ld vectors in the index\n",
+               elapsed() - t0,
+               k,
+               nq);
+
+        std::vector<faiss::idx_t> nns(k * nq);
+        std::vector<float> dis(k * nq);
+
+        index.search(nq, queries.data(), k, dis.data(), nns.data());
+
+        printf("[%.3f s] Query results (vector ids, then distances):\n",
+               elapsed() - t0);
+
+        for (int i = 0; i < nq; i++) {
+            printf("query %2d: ", i);
+            for (int j = 0; j < k; j++) {
+                printf("%7ld ", nns[j + i * k]);
+            }
+            printf("\n     dis: ");
+            for (int j = 0; j < k; j++) {
+                printf("%7g ", dis[j + i * k]);
+            }
+            printf("\n");
+        }
+
+        printf("note that the nearest neighbor is not at "
+               "distance 0 due to quantization errors\n");
+    }
+
+    return 0;
+}
diff --git a/demos/demo_new_test.cpp b/demos/demo_new_test.cpp
new file mode 100644
index 0000000..2e66ab3
--- /dev/null
+++ b/demos/demo_new_test.cpp
@@ -0,0 +1,177 @@
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <sys/time.h>
+
+#include <faiss/IndexHNSW.h>
+#include <faiss/MetricType.h>
+#include <faiss/index_io.h>
+
+// added these
+#include <stdlib.h>
+#include <stdio.h>
+#include <cassert>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <sstream> // for ostringstream
+#include <iomanip>
+#include <thread>
+#include <chrono>
+
+
+
+/*******************************************************
+ * Added for debugging
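+ * debugFlag = 1 prints messages plainly to stdout; debugFlag = 2 prefixes
+ * them with a timestamp and file/line/function info (see the debug() macro
+ * below).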
+ *******************************************************/
+const int debugFlag = 1;
+
+void debugTime() {
+    if (debugFlag) {
+        struct timeval tval;
+        gettimeofday(&tval, NULL);
+        struct tm* tm_info = localtime(&tval.tv_sec);
+        char timeBuff[25] = "";
+        strftime(timeBuff, 25, "%H:%M:%S", tm_info);
+        char timeBuffWithMilli[50] = "";
+        sprintf(timeBuffWithMilli, "%s.%06ld ", timeBuff, tval.tv_usec);
+        std::string timestamp(timeBuffWithMilli);
+        std::cout << timestamp << std::flush;
+    }
+}
+
+// needs at least 2 args always
+// alt debugFlag = 1 // fprintf(stderr, fmt, __VA_ARGS__);
+#define debug(fmt, ...) \
+    do { \
+        if (debugFlag == 1) { \
+            fprintf(stdout, "--" fmt, __VA_ARGS__); \
+        } \
+        if (debugFlag == 2) { \
+            debugTime(); \
+            fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); \
+        } \
+    } while (0)
+
+
+
+double elapsed() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+}
+
+/*******************************************************
+ * Run tests
+ *******************************************************/
+
+// args are nb, M, gamma
+int main(int argc, char* argv[]) {
+    printf("====================\nSTART: running tests for hnsw...\n");
+    double t0 = elapsed();
+    int opt;
+    int d = 128; // dimension of the vectors to index
+    size_t nb;
+    int M;
+    int gamma;
+    // int d = 128; // dimension of the vectors to index
+    // int M = 32 * 1000; // HNSW param M
+    // size_t nb = 1000; // size of the database we plan to index
+
+
+    { // parse arguments
+
+        if (argc != 4) {
+            fprintf(stderr, "Syntax: %s <nb> <M> <gamma>\n", argv[0]);
+            exit(1);
+        }
+
+        nb = strtoul(argv[1], NULL, 10);
+        debug("nb: %ld\n", nb);
+
+        M = atoi(argv[2]);
+        debug("M: %d\n", M);
+
+        gamma = atoi(argv[3]);
+        debug("gamma: %d\n", gamma);
+    }
+
+    printf("[%.3f s] Index Params -- d: %d, M: %d, nb: %ld, gamma: %d\n",
+           elapsed() - t0, d, M, nb, gamma);
+    faiss::IndexHNSWFlat index(d, M, gamma);
+    debug("HNSW index created%s\n", "");
+
+    std::mt19937 rng; // random generator to be used for creating vectors
+
+    size_t nq; // num queries
+    std::vector<float> queries;
+
+    { // populating the database
+        printf("[%.3f s] Building a dataset of %ld vectors to index\n",
+               elapsed() - t0,
+               nb);
+
+        std::vector<float> database(nb * d);
+        std::uniform_real_distribution<> distrib;
+        for (size_t i = 0; i < nb * d; i++) {
+            database[i] = distrib(rng);
+        }
+
+        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
+
+        index.add(nb, database.data());
+
+        printf("[%.3f s] Vectors added\n", elapsed() - t0);
+
+        // TODO: print out stats here
+        // printf("[%.3f s] imbalance factor: %g\n",
+        //        elapsed() - t0,
+        //        index.invlists->imbalance_factor());
+
+        // remember a few elements from the database as queries
+        int i0 = 4;
+        int i1 = 8;
+
+        nq = i1 - i0;
+        queries.resize(nq * d);
+        for (int i = i0; i < i1; i++) {
+            for (int j = 0; j < d; j++) {
+                queries[(i - i0) * d + j] = database[i * d + j];
+            }
+        }
+    }
+
+    { // print out stats
+        index.printStats();
+    }
+
+    { // get index size
+
+        // file name
+        std::ostringstream ss;
+        ss << "./tmp/index_hnsw_N=" << nb << ".faissindex";
+        std::string s_tmp = ss.str();
+        const char* outfilename = s_tmp.c_str();
+        // const char* outfilename = "/tmp/index_hnsw.faissindex";
+        printf("[%.3f s] storing the hnsw index to %s\n",
+               elapsed() - t0,
+               outfilename);
+
+        // write index to disk
+        write_index(&index, outfilename);
+
+        // measure file size
+        std::ifstream in_file(outfilename, std::ios::binary);
+        in_file.seekg(0, std::ios::end);
+        int file_size = in_file.tellg();
+        std::cout << "====Size of the file is " << file_size << " bytes" << std::endl;
+    }
+
+    printf("-----DONE-----\n");
+}
\ No newline at end of file
diff --git a/demos/demo_nndescent.cpp b/demos/demo_nndescent.cpp
new file mode 100644
index 0000000..34594b0
--- /dev/null
+++ b/demos/demo_nndescent.cpp
@@ -0,0 +1,88 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexNNDescent.h>
+
+using namespace std::chrono;
+
+int main(void) {
+    // dimension of the vectors to index
+    int d = 64;
+    int K = 64;
+
+    // size of the database we plan to index
+    size_t nb = 10000;
+
+    std::mt19937 rng(12345);
+
+    // make the index object and train it
+    faiss::IndexNNDescentFlat index(d, K, faiss::METRIC_L2);
+    index.nndescent.S = 10;
+    index.nndescent.R = 32;
+    index.nndescent.L = K;
+    index.nndescent.iter = 10;
+    index.verbose = true;
+
+    // generate labels by IndexFlat
+    faiss::IndexFlat bruteforce(d, faiss::METRIC_L2);
+
+    std::vector<float> database(nb * d);
+    for (size_t i = 0; i < nb * d; i++) {
+        database[i] = rng() % 1024;
+    }
+
+    { // populating the database
+        index.add(nb, database.data());
+        bruteforce.add(nb, database.data());
+    }
+
+    size_t nq = 1000;
+
+    { // searching the database
+        printf("Searching ...\n");
+        index.nndescent.search_L = 50;
+
+        std::vector<float> queries(nq * d);
+        for (size_t i = 0; i < nq * d; i++) {
+            queries[i] = rng() % 1024;
+        }
+
+        int k = 5;
+        std::vector<faiss::idx_t> nns(k * nq);
+        std::vector<faiss::idx_t> gt_nns(k * nq);
+        std::vector<float> dis(k * nq);
+
+        auto start = high_resolution_clock::now();
+        index.search(nq, queries.data(), k, dis.data(), nns.data());
+        auto end = high_resolution_clock::now();
+
+        // find exact kNNs by brute force search
+        bruteforce.search(nq, queries.data(), k, dis.data(), gt_nns.data());
+
+        int recalls = 0;
+        for (size_t i = 0; i < nq; ++i) {
+            for (int n = 0; n < k; n++) {
+                for (int m = 0; m < k; m++) {
+                    if (nns[i * k + n] == gt_nns[i * k + m]) {
+                        recalls += 1;
+                    }
+                }
+            }
+        }
+        float recall = 1.0f * recalls / (k * nq);
+        auto t = duration_cast<microseconds>(end - start).count();
+        int qps = nq * 1.0f * 1000 * 1000 / t;
+
+        printf("Recall@%d: %f, QPS: %d\n", k, recall, qps);
+    }
+}
diff --git a/demos/demo_ondisk_ivf.py b/demos/demo_ondisk_ivf.py
new file mode 100755
index 0000000..e4d6437
--- /dev/null
+++ b/demos/demo_ondisk_ivf.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
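+
+# The script is driven by a stage number given on the command line; a
+# sketch of the intended invocation (stages 1-4 are independent and can
+# run in parallel, e.g. on several machines):
+#
+#   python demo_ondisk_ivf.py 0   # train the index
+#   python demo_ondisk_ivf.py 1   # ...through 4: each adds 1/4 of the data
+#   python demo_ondisk_ivf.py 5   # merge the blocks on disk
+#   python demo_ondisk_ivf.py 6   # load the merged index and search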
+
+import sys
+import numpy as np
+import faiss
+from faiss.contrib.ondisk import merge_ondisk
+
+#################################################################
+# Small I/O functions
+#################################################################
+
+
+def ivecs_read(fname):
+    a = np.fromfile(fname, dtype='int32')
+    d = a[0]
+    return a.reshape(-1, d + 1)[:, 1:].copy()
+
+
+def fvecs_read(fname):
+    return ivecs_read(fname).view('float32')
+
+
+#################################################################
+# Main program
+#################################################################
+
+stage = int(sys.argv[1])
+
+tmpdir = '/tmp/'
+
+if stage == 0:
+    # train the index
+    xt = fvecs_read("sift1M/sift_learn.fvecs")
+    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
+    print("training index")
+    index.train(xt)
+    print("write " + tmpdir + "trained.index")
+    faiss.write_index(index, tmpdir + "trained.index")
+
+
+if 1 <= stage <= 4:
+    # add 1/4 of the database to 4 independent indexes
+    bno = stage - 1
+    xb = fvecs_read("sift1M/sift_base.fvecs")
+    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
+    index = faiss.read_index(tmpdir + "trained.index")
+    print("adding vectors %d:%d" % (i0, i1))
+    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
+    print("write " + tmpdir + "block_%d.index" % bno)
+    faiss.write_index(index, tmpdir + "block_%d.index" % bno)
+
+if stage == 5:
+
+    print('loading trained index')
+    # construct the output index
+    index = faiss.read_index(tmpdir + "trained.index")
+
+    block_fnames = [
+        tmpdir + "block_%d.index" % bno
+        for bno in range(4)
+    ]
+
+    merge_ondisk(index, block_fnames, tmpdir + "merged_index.ivfdata")
+
+    print("write " + tmpdir + "populated.index")
+    faiss.write_index(index, tmpdir + "populated.index")
+
+
+if stage == 6:
+    # perform a search from disk
+    print("read " + tmpdir + "populated.index")
+    index = faiss.read_index(tmpdir + "populated.index")
+    index.nprobe = 16
+
+    # load query vectors and ground-truth
+    xq = fvecs_read("sift1M/sift_query.fvecs")
+    gt = ivecs_read("sift1M/sift_groundtruth.ivecs")
+
+    D, I = index.search(xq, 5)
+
+    recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(xq.shape[0])
+    print("recall@1: %.3f" % recall_at_1)
diff --git a/demos/demo_residual_quantizer.cpp b/demos/demo_residual_quantizer.cpp
new file mode 100644
index 0000000..6166fc1
--- /dev/null
+++ b/demos/demo_residual_quantizer.cpp
@@ -0,0 +1,292 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <faiss/IVFlib.h>
+#include <faiss/IndexAdditiveQuantizer.h>
+#include <faiss/IndexIVFAdditiveQuantizer.h>
+#include <faiss/impl/ResidualQuantizer.h>
+#include <faiss/impl/io.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/utils.h>
+
+/* This demo file shows how to:
+ * - use a DistanceComputer to compute distances with encoded vectors
+ * - in the context of an IVF, how to split an additive quantizer into an
+ * AdditiveCoarseQuantizer and a ResidualQuantizer, in two different ways, with
+ * and without storing the prefix.
+ */
+
+int main() {
+    /******************************************
+     * Generate a test dataset
+     ******************************************/
+    using idx_t = faiss::idx_t;
+    size_t d = 128;
+    size_t nt = 10000;
+    size_t nb = 10000;
+    size_t nq = 100;
+    double t0 = faiss::getmillisecs();
+
+    auto tic = [t0]() {
+        printf("[%.3f s] ", (faiss::getmillisecs() - t0) / 1000);
+    };
+
+    tic();
+    printf("sampling dataset of %zd dim vectors, Q %zd B %zd T %zd\n",
+           d,
+           nq,
+           nb,
+           nt);
+
+    std::vector<float> buf(d * (nq + nt + nb));
+    faiss::rand_smooth_vectors(nq + nt + nb, d, buf.data(), 1234);
+    const float* xt = buf.data();
+    const float* xb = buf.data() + nt * d;
+    const float* xq = buf.data() + (nt + nb) * d;
+
+    idx_t k = 10;
+    std::vector<idx_t> gt(k * nq);
+    std::vector<float> unused(k * nq);
+    tic();
+    printf("compute ground truth, k=%zd\n", k);
+    faiss::knn_L2sqr(xq, xb, d, nq, nb, k, unused.data(), gt.data());
+
+    // a function to compute the accuracy
+    auto accuracy = [&](const idx_t* I) {
+        idx_t accu = 0;
+        for (idx_t q = 0; q < nq; q++) {
+            accu += faiss::ranklist_intersection_size(
+                    k, gt.data() + q * k, k, I + q * k);
+        }
+        return double(accu) / (k * nq);
+    };
+
+    /******************************************
+     * Prepare the residual quantizer
+     ******************************************/
+
+    faiss::ResidualQuantizer rq(
+            d, 7, 6, faiss::AdditiveQuantizer::ST_norm_qint8);
+    // do cheap and inaccurate training
+    rq.cp.niter = 5;
+    rq.max_beam_size = 5;
+    rq.train_type = 0;
+    tic();
+    printf("training the residual quantizer beam_size=%d\n", rq.max_beam_size);
+    rq.train(nt, xt);
+
+    tic();
+    printf("encoding the database, code_size=%zd\n", rq.code_size);
+    size_t code_size = rq.code_size;
+    std::vector<uint8_t> raw_codes(nb * code_size);
+    rq.compute_codes(xb, raw_codes.data(), nb);
+
+    /****************************************************************
+     * Make an index that uses that residual quantizer
+     * Verify that a distance computer gives the same distances
+     ****************************************************************/
+    {
+        faiss::IndexResidualQuantizer index(
+                rq.d, rq.nbits, faiss::METRIC_L2, rq.search_type);
+
+        // override trained index
+        index.rq = rq;
+        index.is_trained = true;
+
+        // override vectors
+        index.codes = raw_codes;
+        index.ntotal = nb;
+
+        tic();
+        printf("IndexResidualQuantizer ready, searching\n");
+
+        std::vector<float> D(k * nq);
+        std::vector<idx_t> I(k * nq);
+        index.search(nq, xq, k, D.data(), I.data());
+
+        tic();
+        printf("Accuracy (intersection @ %zd): %.3f\n", k, accuracy(I.data()));
+        std::unique_ptr<faiss::FlatCodesDistanceComputer> dc(
+                index.get_FlatCodesDistanceComputer());
+
+        float max_diff12 = 0, max_diff13 = 0;
+
+        for (idx_t q = 0; q < nq; q++) {
+            const float* query = xq + q * d;
+            dc->set_query(query);
+            for (int i = 0; i < k; i++) {
+                // 3 ways of computing the same distance
+
+                // distance returned by the index
+                float dis1 = D[q * k + i];
+
+                // distance returned by the DistanceComputer that accesses the
+                // index
+                idx_t db_index = I[q * k + i];
+                float dis2 = (*dc)(db_index);
+
+                // distance computed from a code that does not belong to the
+                // index
+                const uint8_t* code = raw_codes.data() + code_size * db_index;
+                float dis3 = dc->distance_to_code(code);
+
+                max_diff12 = std::max(std::abs(dis1 - dis2), max_diff12);
+                max_diff13 = std::max(std::abs(dis1 - dis3), max_diff13);
+            }
+        }
+        tic();
+        printf("Max DistanceComputer discrepancy 1-2: %g 1-3: %g\n",
+               max_diff12,
+               max_diff13);
+    }
+
+    /****************************************************************
+     * Make an IVF index that uses the first 2 levels as a coarse quantizer
+     * The IVF codes contain the full code (i.e. redundant with the coarse
+     * quantizer code)
+     ****************************************************************/
+    {
+        // build a coarse quantizer from the 2 first levels of the RQ
+        std::vector<size_t> nbits(2);
+        std::copy(rq.nbits.begin(), rq.nbits.begin() + 2, nbits.begin());
+        faiss::ResidualCoarseQuantizer rcq(rq.d, nbits);
+
+        // set the coarse quantizer from the 2 first quantizers
+        rcq.rq.initialize_from(rq);
+        rcq.is_trained = true;
+        rcq.ntotal = (idx_t)1 << rcq.rq.tot_bits;
+
+        // settings for exhaustive search in RCQ
+        rcq.centroid_norms.resize(rcq.ntotal);
+        rcq.aq->compute_centroid_norms(rcq.centroid_norms.data());
+        rcq.beam_factor = -1.0; // use exact search
+        size_t nlist = rcq.ntotal;
+        tic();
+        printf("RCQ nlist = %zd tot_bits=%zd\n", nlist, rcq.rq.tot_bits);
+
+        // build a IVFResidualQuantizer from that
+        faiss::IndexIVFResidualQuantizer index(
+                &rcq, rcq.d, nlist, rq.nbits, faiss::METRIC_L2, rq.search_type);
+        index.by_residual = false;
+        index.rq = rq;
+        index.is_trained = true;
+
+        // there are 3 ways of filling up the index...
+        for (std::string filled_with : {"add", "manual", "derived"}) {
+            tic();
+            printf("filling up the index with %s, code_size=%zd\n",
+                   filled_with.c_str(),
+                   index.code_size);
+
+            index.reset();
+
+            if (filled_with == "add") {
+                // standard add method
+                index.add(nb, xb);
+            } else if (filled_with == "manual") {
+                // compute inverted lists and add elements manually
+                // fill in the inverted index manually
+                faiss::InvertedLists& invlists = *index.invlists;
+
+                // assign vectors to inverted lists
+                std::vector<idx_t> listnos(nb);
+                std::vector<float> unused(nb);
+                rcq.search(nb, xb, 1, unused.data(), listnos.data());
+
+                // populate inverted lists
+                for (idx_t i = 0; i < nb; i++) {
+                    invlists.add_entry(
+                            listnos[i], i, &raw_codes[i * code_size]);
+                }
+
+                index.ntotal = nb;
+            } else if (filled_with == "derived") {
+                // Since we have the raw codes precomputed, their prefix is the
+                // inverted list index, so let's use that.
+                faiss::InvertedLists& invlists = *index.invlists;
+
+                // populate inverted lists
+                for (idx_t i = 0; i < nb; i++) {
+                    const uint8_t* code = &raw_codes[i * code_size];
+                    faiss::BitstringReader rd(code, code_size);
+                    idx_t list_no =
+                            rd.read(rcq.rq.tot_bits); // read the list number
+                    invlists.add_entry(list_no, i, code);
+                }
+
+                index.ntotal = nb;
+            }
+
+            tic();
+            printf("Index filled in\n");
+
+            for (int nprobe : {1, 4, 16, 64, int(nlist)}) {
+                printf("setting nprobe=%-4d", nprobe);
+
+                index.nprobe = nprobe;
+                std::vector<float> D(k * nq);
+                std::vector<idx_t> I(k * nq);
+                index.search(nq, xq, k, D.data(), I.data());
+
+                tic();
+                printf("Accuracy (intersection @ %zd): %.3f\n",
+                       k,
+                       accuracy(I.data()));
+            }
+        }
+    }
+
+    /****************************************************************
+     * Make an IVF index that uses the first 2 levels as a coarse
+     * quantizer, but this time does not store the code prefix from the index
+     ****************************************************************/
+
+    {
+        // build a coarse quantizer from the 2 first levels of the RQ
+        int nlevel = 2;
+
+        std::unique_ptr<faiss::IndexIVFResidualQuantizer> index(
+                faiss::ivflib::ivf_residual_from_quantizer(rq, nlevel));
+
+        // there are 2 ways of filling up the index...
+        for (std::string filled_with : {"add", "derived"}) {
+            tic();
+            printf("filling up the IVF index with %s, code_size=%zd\n",
+                   filled_with.c_str(),
+                   index->code_size);
+
+            index->reset();
+
+            if (filled_with == "add") {
+                // standard add method
+                index->add(nb, xb);
+            } else if (filled_with == "derived") {
+                faiss::ivflib::ivf_residual_add_from_flat_codes(
+                        index.get(), nb, raw_codes.data(), rq.code_size);
+            }
+
+            tic();
+            printf("Index filled in\n");
+
+            for (int nprobe : {1, 4, 16, 64, int(index->nlist)}) {
+                printf("setting nprobe=%-4d", nprobe);
+
+                index->nprobe = nprobe;
+                std::vector<float> D(k * nq);
+                std::vector<idx_t> I(k * nq);
+                index->search(nq, xq, k, D.data(), I.data());
+
+                tic();
+                printf("Accuracy (intersection @ %zd): %.3f\n",
+                       k,
+                       accuracy(I.data()));
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/demos/demo_sift1M.cpp b/demos/demo_sift1M.cpp
new file mode 100644
index 0000000..598565f
--- /dev/null
+++ b/demos/demo_sift1M.cpp
@@ -0,0 +1,256 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+
+#include <unistd.h>
+
+#include <faiss/AutoTune.h>
+#include <faiss/index_factory.h>
+
+/**
+ * To run this demo, please download the ANN_SIFT1M dataset from
+ *
+ *   http://corpus-texmex.irisa.fr/
+ *
+ * and unzip it to the subdirectory sift1M.
+ **/
+
+/*****************************************************
+ * I/O functions for fvecs and ivecs
+ *****************************************************/
+
+float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
+    FILE* f = fopen(fname, "r");
+    if (!f) {
+        fprintf(stderr, "could not open %s\n", fname);
+        perror("");
+        abort();
+    }
+    int d;
+    fread(&d, 1, sizeof(int), f);
+    assert((d > 0 && d < 1000000) || !"unreasonable dimension");
+    fseek(f, 0, SEEK_SET);
+    struct stat st;
+    fstat(fileno(f), &st);
+    size_t sz = st.st_size;
+    assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
+    size_t n = sz / ((d + 1) * 4);
+
+    *d_out = d;
+    *n_out = n;
+    float* x = new float[n * (d + 1)];
+    size_t nr = fread(x, sizeof(float), n * (d + 1), f);
+    assert(nr == n * (d + 1) || !"could not read whole file");
+
+    // shift array to remove row headers
+    for (size_t i = 0; i < n; i++)
+        memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
+
+    fclose(f);
+    return x;
+}
+
+// not very clean, but works as long as sizeof(int) == sizeof(float)
+int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
+    return (int*)fvecs_read(fname, d_out, n_out);
+}
+
+double elapsed() {
+    struct timeval tv;
+    gettimeofday(&tv, nullptr);
+    return tv.tv_sec + tv.tv_usec * 1e-6;
+}
+
+int main() {
+    double t0 = elapsed();
+
+    // this is typically the fastest one.
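+    // The key is a faiss::index_factory string: "IVF4096,Flat" builds an
+    // inverted-file index with 4096 lists that stores uncompressed
+    // vectors, while e.g. "IVF4096,PQ32" would store 32-byte PQ codes.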
+    const char* index_key = "IVF4096,Flat";
+
+    // these ones have better memory usage
+    // const char *index_key = "Flat";
+    // const char *index_key = "PQ32";
+    // const char *index_key = "PCA80,Flat";
+    // const char *index_key = "IVF4096,PQ8+16";
+    // const char *index_key = "IVF4096,PQ32";
+    // const char *index_key = "IMI2x8,PQ32";
+    // const char *index_key = "IMI2x8,PQ8+16";
+    // const char *index_key = "OPQ16_64,IMI2x8,PQ8+16";
+
+    faiss::Index* index;
+
+    size_t d;
+
+    {
+        printf("[%.3f s] Loading train set\n", elapsed() - t0);
+
+        size_t nt;
+        float* xt = fvecs_read("sift1M/sift_learn.fvecs", &d, &nt);
+
+        printf("[%.3f s] Preparing index \"%s\" d=%ld\n",
+               elapsed() - t0,
+               index_key,
+               d);
+        index = faiss::index_factory(d, index_key);
+
+        printf("[%.3f s] Training on %ld vectors\n", elapsed() - t0, nt);
+
+        index->train(nt, xt);
+        delete[] xt;
+    }
+
+    {
+        printf("[%.3f s] Loading database\n", elapsed() - t0);
+
+        size_t nb, d2;
+        float* xb = fvecs_read("sift1M/sift_base.fvecs", &d2, &nb);
+        assert(d == d2 || !"dataset does not have same dimension as train set");
+
+        printf("[%.3f s] Indexing database, size %ld*%ld\n",
+               elapsed() - t0,
+               nb,
+               d);
+
+        index->add(nb, xb);
+
+        delete[] xb;
+    }
+
+    size_t nq;
+    float* xq;
+
+    {
+        printf("[%.3f s] Loading queries\n", elapsed() - t0);
+
+        size_t d2;
+        xq = fvecs_read("sift1M/sift_query.fvecs", &d2, &nq);
+        assert(d == d2 || !"query does not have same dimension as train set");
+    }
+
+    size_t k;         // nb of results per query in the GT
+    faiss::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors
+
+    {
+        printf("[%.3f s] Loading ground truth for %ld queries\n",
+               elapsed() - t0,
+               nq);
+
+        // load ground-truth and convert int to long
+        size_t nq2;
+        int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2);
+        assert(nq2 == nq || !"incorrect nb of ground truth entries");
+
+        gt = new faiss::idx_t[k * nq];
+        for (int i = 0; i < k * nq; i++) {
+            gt[i] = gt_int[i];
+        }
+        delete[] gt_int;
+    }
+
+    // Result of the auto-tuning
+    std::string selected_params;
+
+    { // run auto-tuning
+
+        printf("[%.3f s] Preparing auto-tune criterion 1-recall at 1 "
+               "criterion, with k=%ld nq=%ld\n",
+               elapsed() - t0,
+               k,
+               nq);
+
+        faiss::OneRecallAtRCriterion crit(nq, 1);
+        crit.set_groundtruth(k, nullptr, gt);
+        crit.nnn = k; // by default, the criterion will request only 1 NN
+
+        printf("[%.3f s] Preparing auto-tune parameters\n", elapsed() - t0);
+
+        faiss::ParameterSpace params;
+        params.initialize(index);
+
+        printf("[%.3f s] Auto-tuning over %ld parameters (%ld combinations)\n",
+               elapsed() - t0,
+               params.parameter_ranges.size(),
+               params.n_combinations());
+
+        faiss::OperatingPoints ops;
+        params.explore(index, nq, xq, crit, &ops);
+
+        printf("[%.3f s] Found the following operating points: \n",
+               elapsed() - t0);
+
+        ops.display();
+
+        // keep the first parameter that obtains > 0.5 1-recall@1
+        for (int i = 0; i < ops.optimal_pts.size(); i++) {
+            if (ops.optimal_pts[i].perf > 0.5) {
+                selected_params = ops.optimal_pts[i].key;
+                break;
+            }
+        }
+        assert(selected_params.size() > 0 ||
+               !"could not find good enough op point");
+    }
+
+    { // Use the found configuration to perform a search
+
+        faiss::ParameterSpace params;
+
+        printf("[%.3f s] Setting parameter configuration \"%s\" on index\n",
+               elapsed() - t0,
+               selected_params.c_str());
+
+        params.set_index_parameters(index, selected_params.c_str());
+
+        printf("[%.3f s] Perform a search on %ld queries\n",
+               elapsed() - t0,
+               nq);
+
+        // output buffers
+        faiss::idx_t* I = new faiss::idx_t[nq * k];
+        float* D = new float[nq * k];
+
+        index->search(nq, xq, k, D, I);
+
+        printf("[%.3f s] Compute recalls\n", elapsed() - t0);
+
+        // evaluate result by hand.
+        int n_1 = 0, n_10 = 0, n_100 = 0;
+        for (int i = 0; i < nq; i++) {
+            int gt_nn = gt[i * k];
+            for (int j = 0; j < k; j++) {
+                if (I[i * k + j] == gt_nn) {
+                    if (j < 1)
+                        n_1++;
+                    if (j < 10)
+                        n_10++;
+                    if (j < 100)
+                        n_100++;
+                }
+            }
+        }
+        printf("R@1 = %.4f\n", n_1 / float(nq));
+        printf("R@10 = %.4f\n", n_10 / float(nq));
+        printf("R@100 = %.4f\n", n_100 / float(nq));
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xq;
+    delete[] gt;
+    delete index;
+    return 0;
+}
diff --git a/demos/demo_weighted_kmeans.cpp b/demos/demo_weighted_kmeans.cpp
new file mode 100644
index 0000000..f6f89fa
--- /dev/null
+++ b/demos/demo_weighted_kmeans.cpp
@@ -0,0 +1,180 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+
+#include <faiss/Clustering.h>
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexHNSW.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/random.h>
+
+namespace {
+
+enum WeightedKMeansType {
+    WKMT_FlatL2,
+    WKMT_FlatIP,
+    WKMT_FlatIP_spherical,
+    WKMT_HNSW,
+};
+
+float weighted_kmeans_clustering(
+        size_t d,
+        size_t n,
+        size_t k,
+        const float* input,
+        const float* weights,
+        float* centroids,
+        WeightedKMeansType index_num) {
+    using namespace faiss;
+    Clustering clus(d, k);
+    clus.verbose = true;
+
+    std::unique_ptr<Index> index;
+
+    switch (index_num) {
+        case WKMT_FlatL2:
+            index.reset(new IndexFlatL2(d));
+            break;
+        case WKMT_FlatIP:
+            index.reset(new IndexFlatIP(d));
+            break;
+        case WKMT_FlatIP_spherical:
+            index.reset(new IndexFlatIP(d));
+            clus.spherical = true;
+            break;
+        case WKMT_HNSW:
+            IndexHNSWFlat* ihnsw = new IndexHNSWFlat(d, 32);
+            ihnsw->hnsw.efSearch = 128;
+            index.reset(ihnsw);
+            break;
+    }
+
+    clus.train(n, input, *index.get(), weights);
+    // on output the index contains the centroids.
+    memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
+    return clus.iteration_stats.back().obj;
+}
+
+int d = 32;
+float sigma = 0.1;
+
+#define BIGTEST
+
+#ifdef BIGTEST
+// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
+int nc = 200000;
+int n_big = 4;
+int n_small = 2;
+#else
+int nc = 5;
+int n_big = 100;
+int n_small = 10;
+#endif
+
+int n; // number of training points
+
+void generate_trainset(
+        std::vector<float>& ccent,
+        std::vector<float>& x,
+        std::vector<float>& weights) {
+    // same sampling as test_build_blocks.py test_weighted
+
+    ccent.resize(d * 2 * nc);
+    faiss::float_randn(ccent.data(), d * 2 * nc, 123);
+    faiss::fvec_renorm_L2(d, 2 * nc, ccent.data());
+    n = nc * n_big + nc * n_small;
+    x.resize(d * n);
+    weights.resize(n);
+    faiss::float_randn(x.data(), x.size(), 1234);
+
+    float* xi = x.data();
+    float* w = weights.data();
+    for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
+        int np = ci < nc ? n_big : n_small; // nb of points around this centroid
+        for (int i = 0; i < np; i++) {
+            for (int j = 0; j < d; j++) {
+                xi[j] = xi[j] * sigma + ccent[ci * d + j];
+            }
+            *w++ = ci < nc ?
0.1 : 10; + xi += d; + } + } +} + +} // namespace + +int main(int argc, char** argv) { + std::vector ccent; + std::vector x; + std::vector weights; + + printf("generate training set\n"); + generate_trainset(ccent, x, weights); + + std::vector centroids; + centroids.resize(nc * d); + + int the_index_num = -1; + int the_with_weights = -1; + + if (argc == 3) { + the_index_num = atoi(argv[1]); + the_with_weights = atoi(argv[2]); + } + + for (int index_num = WKMT_FlatL2; index_num <= WKMT_HNSW; index_num++) { + if (the_index_num >= 0 && index_num != the_index_num) { + continue; + } + + for (int with_weights = 0; with_weights <= 1; with_weights++) { + if (the_with_weights >= 0 && with_weights != the_with_weights) { + continue; + } + + printf("=================== index_num=%d Run %s weights\n", + index_num, + with_weights ? "with" : "without"); + + weighted_kmeans_clustering( + d, + n, + nc, + x.data(), + with_weights ? weights.data() : nullptr, + centroids.data(), + (WeightedKMeansType)index_num); + + { // compute distance of points to centroids + faiss::IndexFlatL2 cent_index(d); + cent_index.add(nc, centroids.data()); + std::vector dis(n); + std::vector idx(n); + + cent_index.search( + nc * 2, ccent.data(), 1, dis.data(), idx.data()); + + float dis1 = 0, dis2 = 0; + for (int i = 0; i < nc; i++) { + dis1 += dis[i]; + } + printf("average distance of points from big clusters: %g\n", + dis1 / nc); + + for (int i = 0; i < nc; i++) { + dis2 += dis[i + nc]; + } + + printf("average distance of points from small clusters: %g\n", + dis2 / nc); + } + } + } + return 0; +} diff --git a/demos/metadata.txt b/demos/metadata.txt new file mode 100644 index 0000000..9257b63 --- /dev/null +++ b/demos/metadata.txt @@ -0,0 +1,61 @@ +## 1000 rand values between 1 and 10 - for testing nb=1000, s =.1 + +7, 2, 2, 6, 6, 4, 7, 9, 6, 2, 8, 1, 9, 4, 9, 3, 8, + 1, 5, 4, 6, 3, 9, 5, 5, 2, 6, 10, 7, 4, 9, 10, 9, 8, + 9, 6, 3, 5, 9, 6, 9, 3, 10, 2, 10, 3, 5, 7, 7, 8, 10, + 1, 2, 2, 8, 3, 5, 7, 9, 8, 10, 9, 1, 4, 6, 2, 7, 1, + 9, 10, 10, 7, 1, 1, 5, 5, 8, 1, 6, 2, 1, 5, 1, 4, 9, + 2, 10, 7, 5, 1, 10, 2, 1, 4, 4, 8, 7, 2, 2, 10, 5, 9, + 2, 2, 2, 1, 6, 5, 2, 5, 2, 6, 5, 7, 7, 4, 5, 1, 9, + 4, 1, 1, 3, 1, 2, 7, 9, 2, 9, 7, 10, 8, 8, 6, 9, 7, + 5, 8, 8, 4, 9, 3, 3, 6, 8, 6, 9, 6, 9, 9, 5, 8, 6, + 3, 5, 2, 6, 4, 8, 1, 7, 8, 3, 5, 1, 4, 2, 7, 1, 9, + 10, 2, 7, 7, 3, 3, 3, 6, 4, 3, 4, 5, 10, 10, 6, 6, 9, + 8, 5, 9, 1, 1, 3, 5, 8, 4, 5, 3, 10, 2, 4, 7, 1, 3, + 6, 1, 1, 7, 5, 6, 6, 6, 3, 6, 4, 1, 9, 3, 2, 6, 3, + 1, 5, 5, 1, 8, 8, 1, 1, 9, 4, 9, 4, 4, 4, 7, 2, 2, + 4, 10, 5, 5, 5, 6, 1, 4, 2, 4, 8, 1, 1, 3, 8, 9, 2, + 2, 8, 2, 6, 4, 7, 8, 5, 8, 7, 7, 3, 6, 2, 3, 7, 10, + 9, 10, 6, 6, 2, 2, 10, 2, 6, 3, 8, 3, 10, 7, 1, 3, 7, + 7, 1, 4, 10, 10, 5, 7, 3, 4, 4, 3, 8, 3, 1, 1, 4, 2, + 4, 6, 6, 9, 1, 10, 3, 2, 5, 9, 9, 9, 5, 7, 3, 1, 3, + 9, 1, 9, 1, 3, 1, 1, 7, 10, 5, 8, 7, 7, 9, 1, 1, 7, + 8, 6, 9, 3, 2, 3, 9, 1, 2, 2, 10, 3, 10, 5, 8, 2, 8, + 6, 1, 10, 5, 1, 6, 1, 4, 2, 5, 10, 7, 8, 10, 2, 3, 2, + 6, 1, 7, 3, 6, 8, 7, 8, 9, 4, 5, 10, 1, 8, 7, 9, 8, + 5, 2, 5, 7, 9, 10, 2, 5, 7, 1, 3, 9, 1, 5, 1, 3, 3, + 7, 2, 5, 8, 10, 8, 10, 7, 5, 3, 1, 6, 4, 2, 10, 10, 6, + 4, 8, 6, 5, 2, 7, 5, 3, 1, 1, 8, 3, 4, 10, 8, 1, 5, + 4, 9, 2, 8, 6, 5, 4, 3, 1, 9, 6, 4, 3, 3, 5, 2, 3, + 10, 2, 10, 5, 2, 9, 9, 1, 5, 4, 6, 3, 4, 10, 2, 9, 6, + 10, 3, 4, 5, 6, 5, 2, 1, 6, 3, 3, 6, 1, 3, 9, 6, 5, + 5, 1, 3, 2, 3, 7, 7, 4, 10, 4, 9, 9, 6, 2, 10, 2, 6, + 8, 5, 1, 10, 1, 9, 1, 3, 2, 4, 2, 7, 1, 10, 6, 9, 10, + 2, 1, 7, 10, 9, 2, 10, 4, 7, 8, 1, 5, 10, 4, 6, 4, 3, + 4, 
3, 5, 1, 9, 2, 7, 9, 6, 1, 7, 8, 3, 6, 5, 2, 3, + 6, 8, 2, 6, 1, 8, 3, 10, 3, 4, 7, 2, 1, 3, 8, 4, 4, + 7, 4, 1, 8, 7, 1, 2, 10, 10, 6, 8, 3, 9, 4, 3, 3, 3, + 5, 10, 5, 4, 5, 3, 6, 1, 9, 5, 9, 4, 5, 7, 5, 8, 5, + 7, 6, 4, 1, 7, 1, 2, 8, 1, 5, 2, 2, 1, 4, 10, 10, 5, + 8, 5, 6, 4, 2, 5, 6, 5, 1, 5, 10, 6, 10, 1, 1, 7, 4, + 8, 9, 7, 10, 3, 8, 6, 5, 7, 2, 1, 3, 2, 5, 5, 4, 2, + 4, 5, 9, 7, 9, 8, 4, 6, 6, 3, 1, 8, 9, 1, 7, 9, 1, + 8, 4, 4, 4, 3, 8, 9, 7, 6, 3, 10, 7, 2, 4, 7, 5, 4, + 9, 7, 6, 4, 6, 5, 1, 7, 2, 8, 2, 5, 3, 3, 2, 4, 10, + 1, 4, 9, 9, 3, 4, 1, 10, 2, 1, 1, 9, 3, 5, 3, 1, 1, + 3, 10, 3, 7, 10, 6, 7, 10, 6, 4, 9, 5, 3, 8, 8, 6, 9, + 6, 6, 5, 1, 8, 1, 8, 2, 2, 6, 10, 3, 7, 4, 2, 7, 4, + 1, 2, 1, 8, 9, 2, 10, 8, 5, 2, 6, 9, 4, 4, 9, 8, 7, + 7, 5, 6, 9, 6, 4, 1, 3, 2, 8, 3, 6, 1, 9, 2, 3, 4, + 3, 2, 7, 7, 3, 1, 2, 2, 1, 3, 7, 5, 5, 1, 4, 2, 10, + 3, 8, 1, 6, 4, 7, 4, 2, 9, 3, 8, 7, 9, 5, 6, 9, 6, + 10, 4, 8, 2, 10, 2, 8, 6, 5, 8, 2, 5, 3, 8, 1, 6, 2, + 7, 3, 5, 9, 2, 5, 7, 2, 1, 6, 9, 3, 7, 1, 4, 10, 6, + 3, 9, 8, 8, 10, 7, 2, 2, 3, 7, 4, 8, 7, 6, 9, 2, 8, + 2, 3, 3, 5, 9, 2, 2, 4, 9, 9, 7, 5, 2, 2, 3, 4, 3, + 6, 7, 9, 2, 1, 9, 4, 8, 3, 7, 6, 6, 7, 3, 9, 4, 6, + 7, 4, 1, 3, 3, 5, 10, 1, 5, 4, 4, 9, 4, 4, 9, 1, 1, + 1, 3, 5, 1, 6, 2, 2, 4, 6, 7, 9, 1, 7, 3, 1, 8, 9, + 1, 5, 5, 6, 7, 8, 5, 1, 2, 4, 9, 9, 9, 7, 10, 10, 7, + 10, 5, 1, 8, 10, 5, 6, 4, 5, 8, 2, 3, 7, 9, 3, 9, 4, + 7, 5, 8, 1, 9, 2, 9, 1, 6, 10, 3, 3, 9, 6 \ No newline at end of file diff --git a/demos/test_acorn.cpp b/demos/test_acorn.cpp new file mode 100644 index 0000000..91038a3 --- /dev/null +++ b/demos/test_acorn.cpp @@ -0,0 +1,442 @@ +#include +#include +#include +#include +#include + + +#include + +#include +#include +#include +#include + +#include +#include +#include + + +// added these +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for ostringstream +#include +#include +#include +#include /* assert */ +#include +#include +#include +#include // for std::accumulate +#include // for std::mean and std::stdev +#include +#include "utils.cpp" + + + + +// create indices for debugging, write indices to file, and get recall stats for all queries +int main(int argc, char *argv[]) { + unsigned int nthreads = std::thread::hardware_concurrency(); + std::cout << "====================\nSTART: running TEST_ACORN for hnsw, sift data --" << nthreads << "cores\n" << std::endl; + // printf("====================\nSTART: running MAKE_INDICES for hnsw --...\n"); + double t0 = elapsed(); + + int efc = 40; // default is 40 + int efs = 16; // default is 16 + int k = 10; // search parameter + size_t d = 128; // dimension of the vectors to index - will be overwritten by the dimension of the dataset + int M; // HSNW param M TODO change M back + int M_beta; // param for compression + // float attr_sel = 0.001; + // int gamma = (int) 1 / attr_sel; + int gamma; + int n_centroids; + // int filter = 0; + std::string dataset; // must be sift1B or sift1M or tripclick + int test_partitions = 0; + int step = 10; //2 + + std::string assignment_type = "rand"; + int alpha = 0; + + srand(0); // seed for random number generator + int num_trials = 60; + + + size_t N = 0; // N will be how many we truncate nb from sift1M to + + int opt; + {// parse arguments + + if (argc < 6 || argc > 8) { + fprintf(stderr, "Syntax: %s [] [] \n", argv[0]); + exit(1); + } + + N = strtoul(argv[1], NULL, 10); + printf("N: %ld\n", N); + + + gamma = atoi(argv[2]); + printf("gamma: %d\n", gamma); + + + + + dataset = 
diff --git a/demos/test_acorn.cpp b/demos/test_acorn.cpp
new file mode 100644
index 0000000..91038a3
--- /dev/null
+++ b/demos/test_acorn.cpp
@@ -0,0 +1,442 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <cstring>
+#include <cmath>
+
+#include <sys/time.h>
+#include <sys/stat.h>
+
+#include <faiss/Index.h>
+#include <faiss/IndexHNSW.h>
+#include <faiss/IndexACORN.h>
+#include <faiss/index_io.h>
+
+// added these
+#include <iostream>
+#include <sstream> // for ostringstream
+#include <fstream>
+#include <string>
+#include <set>
+#include <thread>
+#include <numeric> // for std::accumulate
+#include <assert.h> /* assert */
+#include <nlohmann/json.hpp>
+#include "utils.cpp"
+
+// create indices for debugging, write indices to file, and get recall stats for all queries
+int main(int argc, char* argv[]) {
+    unsigned int nthreads = std::thread::hardware_concurrency();
+    std::cout << "====================\nSTART: running TEST_ACORN for hnsw, sift data -- "
+              << nthreads << " cores\n" << std::endl;
+    double t0 = elapsed();
+
+    int efc = 40;   // default is 40
+    int efs = 16;   // default is 16
+    int k = 10;     // search parameter
+    size_t d = 128; // dimension of the vectors to index - will be overwritten by the dimension of the dataset
+    int M;          // HNSW param M, TODO change M back
+    int M_beta;     // param for compression
+    // float attr_sel = 0.001;
+    // int gamma = (int) 1 / attr_sel;
+    int gamma;
+    int n_centroids;
+    // int filter = 0;
+    std::string dataset; // must be one of [sift1M, sift1M_test, sift1B, tripclick, paper, paper_rand2m]
+    int test_partitions = 0;
+    int step = 10; // 2
+
+    std::string assignment_type = "rand";
+    int alpha = 0;
+
+    srand(0); // seed for random number generator
+    int num_trials = 60;
+
+    size_t N = 0; // N will be how many we truncate nb from sift1M to
+
+    int opt;
+    { // parse arguments
+        if (argc < 6 || argc > 8) {
+            fprintf(stderr,
+                    "Syntax: %s <N> <gamma> <dataset> <M> <M_beta> [] []\n",
+                    argv[0]);
+            exit(1);
+        }
+
+        N = strtoul(argv[1], NULL, 10);
+        printf("N: %ld\n", N);
+
+        gamma = atoi(argv[2]);
+        printf("gamma: %d\n", gamma);
+
+        dataset = argv[3];
+        printf("dataset: %s\n", dataset.c_str());
+        if (dataset != "sift1M" && dataset != "sift1M_test" &&
+            dataset != "sift1B" && dataset != "tripclick" &&
+            dataset != "paper" && dataset != "paper_rand2m") {
+            printf("got dataset: %s\n", dataset.c_str());
+            fprintf(stderr,
+                    "Invalid <dataset>; must be one of [sift1M, sift1M_test, sift1B, tripclick, paper, paper_rand2m]\n");
+            exit(1);
+        }
+
+        M = atoi(argv[4]);
+        printf("M: %d\n", M);
+
+        M_beta = atoi(argv[5]);
+        printf("M_beta: %d\n", M_beta);
+    }
+
+    // load metadata
+    n_centroids = gamma;
+
+    std::vector<int> metadata = load_ab(dataset, gamma, assignment_type, N);
+    metadata.resize(N);
+    assert(N == metadata.size());
+    printf("[%.3f s] Loaded metadata, %ld attr's found\n",
+           elapsed() - t0, metadata.size());
+
+    size_t nq;
+    float* xq;
+    std::vector<int> aq;
+    { // load query vectors and attributes
+        printf("[%.3f s] Loading query vectors and attributes\n",
+               elapsed() - t0);
+
+        size_t d2;
+        bool is_base = 0;
+        std::string filename = get_file_name(dataset, is_base);
+        xq = fvecs_read(filename.c_str(), &d2, &nq);
+        assert(d == d2 || !"query does not have same dimension as expected 128");
+        if (d != d2) {
+            d = d2;
+        }
+
+        std::cout << "query vecs data loaded, with dim: " << d2 << ", nq=" << nq << std::endl;
+        printf("[%.3f s] Loaded query vectors from %s\n",
+               elapsed() - t0, filename.c_str());
+        aq = load_aq(dataset, n_centroids, alpha, N);
+        printf("[%.3f s] Loaded %ld %s queries\n",
+               elapsed() - t0, nq, dataset.c_str());
+    }
+
+    int gt_size = 100;
+    if (dataset == "sift1M_test" || dataset == "paper") {
+        gt_size = 10;
+    }
+    std::vector<faiss::idx_t> gt(gt_size * nq);
+    { // load ground truth
+        gt = load_gt(dataset, gamma, alpha, assignment_type, N);
+        printf("[%.3f s] Loaded ground truth, gt_size: %d\n",
+               elapsed() - t0, gt_size);
+    }
+
+    // create normal (base) and hybrid index
+    printf("[%.3f s] Index Params -- d: %ld, M: %d, N: %ld, gamma: %d\n",
+           elapsed() - t0, d, M, N, gamma);
+    // base HNSW index
+    faiss::IndexHNSWFlat base_index(d, M, 1); // gamma = 1
+    base_index.hnsw.efConstruction = efc; // default is 40 in HNSW.cpp
+    base_index.hnsw.efSearch = efs;       // default is 16 in HNSW.cpp
+
+    // ACORN-gamma
+    faiss::IndexACORNFlat hybrid_index(d, M, gamma, metadata, M_beta);
+    hybrid_index.acorn.efSearch = efs; // default is 16 in HybridHNSW.cpp
+    debug("ACORN index created%s\n", "");
+
+    // ACORN-1
+    faiss::IndexACORNFlat hybrid_index_gamma1(d, M, 1, metadata, M * 2);
+    hybrid_index_gamma1.acorn.efSearch = efs; // default is 16 in HybridHNSW.cpp
+
+    { // populating the database
+        std::cout << "====================Vectors====================\n" << std::endl;
+
+        printf("[%.3f s] Loading database\n", elapsed() - t0);
+
+        size_t nb, d2;
+        bool is_base = 1;
+        std::string filename = get_file_name(dataset, is_base);
+        float* xb = fvecs_read(filename.c_str(), &d2, &nb);
+        assert(d == d2 || !"dataset does not have dim 128 as expected");
+        printf("[%.3f s] Loaded base vectors from file: %s\n",
+               elapsed() - t0, filename.c_str());
+
+        std::cout << "data loaded, with dim: " << d2 << ", nb=" << nb << std::endl;
+
+        printf("[%.3f s] Indexing database, size %ld*%ld from max %ld\n",
+               elapsed() - t0, N, d2, nb);
+
+        printf("[%.3f s] Adding the vectors to the index\n", elapsed() - t0);
+
+        base_index.add(N, xb);
+        printf("[%.3f s] Vectors added to base index\n", elapsed() - t0);
+        std::cout << "Base index vectors added: " << N << std::endl;
+
+        hybrid_index.add(N, xb);
+        printf("[%.3f s] Vectors added to hybrid index\n", elapsed() - t0);
+        std::cout << "Hybrid index vectors added: " << N << std::endl;
+
+        hybrid_index_gamma1.add(N, xb);
+        printf("[%.3f s] Vectors added to hybrid index with gamma=1\n",
+               elapsed() - t0);
+        std::cout << "Hybrid index with gamma=1 vectors added: " << N << std::endl;
+
+        delete[] xb;
+    }
+
+    // write hybrid index and partition indices to files
+    {
+        std::cout << "====================Write Index====================\n" << std::endl;
+        // write hybrid index
+        std::stringstream filepath_stream;
+        if (dataset == "sift1M" || dataset == "sift1B") {
+            filepath_stream << "./tmp/hybrid_" << (int)(N / 1000 / 1000)
+                            << "m_nc=" << n_centroids
+                            << "_assignment=" << assignment_type
+                            << "_alpha=" << alpha << ".json";
+        } else {
+            filepath_stream << "./tmp/" << dataset << "/hybrid"
+                            << "_M=" << M << "_efc" << efc << "_Mb=" << M_beta
+                            << "_gamma=" << gamma << ".json";
+        }
+        std::string filepath = filepath_stream.str();
+        write_index(&hybrid_index, filepath.c_str());
+        printf("[%.3f s] Wrote hybrid index to file: %s\n",
+               elapsed() - t0, filepath.c_str());
+
+        // write hybrid_gamma1 index
+        std::stringstream filepath_stream2;
+        if (dataset == "sift1M" || dataset == "sift1B") {
+            filepath_stream2 << "./tmp/hybrid_gamma1_" << (int)(N / 1000 / 1000)
+                             << "m_nc=" << n_centroids
+                             << "_assignment=" << assignment_type
+                             << "_alpha=" << alpha << ".json";
+        } else {
+            filepath_stream2 << "./tmp/" << dataset << "/hybrid"
+                             << "_M=" << M << "_efc" << efc << "_Mb=" << M_beta
+                             << "_gamma=" << 1 << ".json";
+        }
+        std::string filepath2 = filepath_stream2.str();
+        write_index(&hybrid_index_gamma1, filepath2.c_str());
+        printf("[%.3f s] Wrote hybrid_gamma1 index to file: %s\n",
+               elapsed() - t0, filepath2.c_str());
+
+        { // write base index
+            std::stringstream filepath_stream;
+            if (dataset == "sift1M" || dataset == "sift1B") {
+                filepath_stream << "./tmp/base_" << (int)(N / 1000 / 1000)
+                                << "m_nc=" << n_centroids
+                                << "_assignment=" << assignment_type
+                                << "_alpha=" << alpha << ".json";
+            } else {
+                filepath_stream << "./tmp/" << dataset << "/base"
+                                << "_M=" << M << "_efc=" << efc << ".json";
+            }
+            std::string filepath = filepath_stream.str();
+            write_index(&base_index, filepath.c_str());
+            printf("[%.3f s] Wrote base index to file: %s\n",
+                   elapsed() - t0, filepath.c_str());
+        }
+    }
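+    // The indices written above can be read back with Faiss's standard I/O
+    // (sketch; assumes this fork's read_index also round-trips ACORN
+    // indices, as the write_index calls above imply):
+    //
+    //   faiss::Index* loaded = faiss::read_index("<filepath written above>");
+    //   auto* acorn = dynamic_cast<faiss::IndexACORNFlat*>(loaded);
+    //   // ... acorn->acorn.efSearch, acorn->search(...), etc. ...
+    //   delete loaded;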
+    { // print out stats
+        printf("====================================\n");
+        printf("============ BASE INDEX ============\n");
+        printf("====================================\n");
+        base_index.printStats(false);
+        printf("====================================\n");
+        printf("============ ACORN INDEX ===========\n");
+        printf("====================================\n");
+        hybrid_index.printStats(false);
+    }
+
+    printf("==============================================\n");
+    printf("====================Search====================\n");
+    printf("==============================================\n");
+    double t1 = elapsed();
+
+    { // searching the base database
+        printf("====================HNSW INDEX====================\n");
+        printf("[%.3f s] Searching the %d nearest neighbors "
+               "of %ld vectors in the index, efsearch %d\n",
+               elapsed() - t0,
+               k,
+               nq,
+               base_index.hnsw.efSearch);
+
+        std::vector<faiss::idx_t> nns(k * nq);
+        std::vector<float> dis(k * nq);
+
+        std::cout << "nn and dis size: " << nns.size() << " " << dis.size() << std::endl;
+
+        double t1 = elapsed();
+        base_index.search(nq, xq, k, dis.data(), nns.data());
+        double t2 = elapsed();
+
+        printf("[%.3f s] Query results (vector ids, then distances):\n",
+               elapsed() - t0);
+
+        // print at most 5 queries
+        int nq_print = std::min(5, (int)nq);
+        for (int i = 0; i < nq_print; i++) {
+            printf("query %2d nn's: ", i);
+            for (int j = 0; j < k; j++) {
+                printf("%7ld (%d) ", nns[j + i * k], metadata[nns[j + i * k]]);
+            }
+            printf("\n     dis: \t");
+            for (int j = 0; j < k; j++) {
+                printf("%7g ", dis[j + i * k]);
+            }
+            printf("\n");
+        }
+
+        printf("[%.3f s] *** Query time: %f\n",
+               elapsed() - t0, t2 - t1);
+
+        std::cout << "finished base index examples" << std::endl;
+    }
+
+    { // look at stats
+        const faiss::HNSWStats& stats = faiss::hnsw_stats;
+
+        std::cout << "============= BASE HNSW QUERY PROFILING STATS =============" << std::endl;
+        printf("[%.3f s] Timing results for search of k=%d nearest neighbors of nq=%ld vectors in the index\n",
+               elapsed() - t0,
+               k,
+               nq);
+        std::cout << "n1: " << stats.n1 << std::endl;
+        std::cout << "n2: " << stats.n2 << std::endl;
+        std::cout << "n3 (number distance comps at level 0): " << stats.n3 << std::endl;
+        std::cout << "ndis: " << stats.ndis << std::endl;
+        std::cout << "nreorder: " << stats.nreorder << std::endl;
+        printf("average distance computations per query: %f\n",
+               (float)stats.n3 / stats.n1);
+    }
+
+    { // searching the hybrid database
+        printf("==================== ACORN INDEX ====================\n");
+        printf("[%.3f s] Searching the %d nearest neighbors "
+               "of %ld vectors in the index, efsearch %d\n",
+               elapsed() - t0,
+               k,
+               nq,
+               hybrid_index.acorn.efSearch);
+
+        std::vector<faiss::idx_t> nns2(k * nq);
+        std::vector<float> dis2(k * nq);
+
+        // create filter_ids_map, ie a bitmap of the ids that pass each
+        // query's attribute filter
+        std::vector<char> filter_ids_map(nq * N);
+        for (int xq = 0; xq < nq; xq++) {
+            for (int xb = 0; xb < N; xb++) {
+                filter_ids_map[xq * N + xb] = (bool)(metadata[xb] == aq[xq]);
+            }
+        }
+
+        double t1_x = elapsed();
+        hybrid_index.search(nq, xq, k, dis2.data(), nns2.data(), filter_ids_map.data());
+        double t2_x = elapsed();
+
+        printf("[%.3f s] Query results (vector ids, then distances):\n",
+               elapsed() - t0);
+
+        int nq_print = std::min(5, (int)nq);
+        for (int i = 0; i < nq_print; i++) {
+            printf("query %2d nn's (%d): ", i, aq[i]);
+            for (int j = 0; j < k; j++) {
+                printf("%7ld (%d) ", nns2[j + i * k], metadata[nns2[j + i * k]]);
+            }
+            printf("\n     dis: \t");
+            for (int j = 0; j < k; j++) {
+                printf("%7g ", dis2[j + i * k]);
+            }
+            printf("\n");
+        }
+
+        printf("[%.3f s] *** Query time: %f\n",
+               elapsed() - t0, t2_x - t1_x);
+
+        std::cout << "finished hybrid index examples" << std::endl;
+    }
+
+    { // look at stats
+        const faiss::ACORNStats& stats = faiss::acorn_stats;
+
+        std::cout << "============= ACORN QUERY PROFILING STATS =============" << std::endl;
+        printf("[%.3f s] Timing results for search of k=%d nearest neighbors of nq=%ld vectors in the index\n",
+               elapsed() - t0,
+               k,
+               nq);
+        std::cout << "n1: " << stats.n1 << std::endl;
+        std::cout << "n2: " << stats.n2 << std::endl;
+        std::cout << "n3 (number distance comps at level 0): " << stats.n3 << std::endl;
+        std::cout << "ndis: " << stats.ndis << std::endl;
+        std::cout << "nreorder: " << stats.nreorder << std::endl;
+        printf("average distance computations per query: %f\n",
+               (float)stats.n3 / stats.n1);
+    }
+
+    printf("[%.3f s] -----DONE-----\n", elapsed() - t0);
+}
\ No newline at end of file
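test_acorn.cpp loads the ground truth but stops short of reporting recall. A minimal sketch of how the compute_recall helper from demos/utils.cpp (below) could be wired in, assuming the demo's variable layout (gt, gt_size, nns2, nq, k, gamma); not part of the patch:

```cpp
// report_recall.cpp (hypothetical): Recall@10 for results laid out as in
// test_acorn.cpp, i.e. k ids per query in row-major order.
#include <cstdio>
#include <vector>
#include "utils.cpp" // brings in faiss::idx_t and compute_recall

void report_recall(std::vector<faiss::idx_t>& gt, int gt_size,
                   std::vector<faiss::idx_t>& nns, int nq, int k, int gamma) {
    // compute_recall keeps k/gamma ground-truth ids per query and counts
    // how many returned ids fall in that set
    float r10 = compute_recall(gt, gt_size, nns, nq, k, gamma);
    std::printf("Recall@10: %.4f\n", r10);
}
```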
diff --git a/demos/utils.cpp b/demos/utils.cpp
new file mode 100644
index 0000000..e3455be
--- /dev/null
+++ b/demos/utils.cpp
@@ -0,0 +1,563 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <cstring>
+#include <cmath>
+
+#include <sys/time.h>
+#include <sys/stat.h>
+
+#include <faiss/Index.h>
+#include <faiss/IndexHNSW.h>
+#include <faiss/IndexACORN.h>
+#include <faiss/index_io.h>
+
+// added these
+#include <iostream>
+#include <sstream> // for ostringstream
+#include <fstream>
+#include <string>
+#include <set>
+#include <thread>
+#include <numeric> // for std::accumulate
+#include <assert.h> /* assert */
+
+#include <nlohmann/json.hpp>
+// for convenience
+using json = nlohmann::json;
+
+/**
+ * To run this demo, please download the ANN_SIFT1M dataset from
+ *
+ *   http://corpus-texmex.irisa.fr/
+ *
+ *   -> wget -r ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
+ *   -> cd ftp.irisa.fr/local/texmex/corpus
+ *   -> tar -xf sift.tar.gz
+ *
+ * and unzip it to the subdirectory sift1M.
+ **/
+
+// MACRO
+#define TESTING_DATA_DIR "./testing_data"
+
+/*****************************************************
+ * I/O functions for fvecs and ivecs
+ *****************************************************/
+
+bool fileExists(const std::string& filePath) {
+    std::ifstream file(filePath);
+    return file.good();
+}
+
+float* fvecs_read(const char* fname, size_t* d_out, size_t* n_out) {
+    FILE* f = fopen(fname, "r");
+    if (!f) {
+        fprintf(stderr, "could not open %s\n", fname);
+        perror("");
+        abort();
+    }
+    int d;
+    fread(&d, 1, sizeof(int), f);
+    assert((d > 0 && d < 1000000) || !"unreasonable dimension");
+    fseek(f, 0, SEEK_SET);
+    struct stat st;
+    fstat(fileno(f), &st);
+    size_t sz = st.st_size;
+    assert(sz % ((d + 1) * 4) == 0 || !"weird file size");
+    size_t n = sz / ((d + 1) * 4);
+
+    *d_out = d;
+    *n_out = n;
+    float* x = new float[n * (d + 1)];
+    size_t nr = fread(x, sizeof(float), n * (d + 1), f);
+    assert(nr == n * (d + 1) || !"could not read whole file");
+
+    // shift array to remove row headers
+    for (size_t i = 0; i < n; i++)
+        memmove(x + i * d, x + 1 + i * (d + 1), d * sizeof(*x));
+
+    fclose(f);
+    return x;
+}
+
+// not very clean, but works as long as sizeof(int) == sizeof(float)
+int* ivecs_read(const char* fname, size_t* d_out, size_t* n_out) {
+    return (int*)fvecs_read(fname, d_out, n_out);
+}
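+
+// Example use of fvecs_read (sketch): each fvecs record is an int32
+// dimension header followed by d floats; the headers are stripped and the
+// buffer is allocated with new[], so free it with delete[].
+//
+//   size_t d, nb;
+//   float* xb = fvecs_read("./Datasets/sift1M/sift_base.fvecs", &d, &nb);
+//   // xb[i * d + j] is component j of vector i
+//   delete[] xb;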
"base_vecs_tripclick" : "query_vecs_tripclick_min100") + ".fvecs"; + } else if (dataset == "paper" || dataset == "paper_rand2m") { + return std::string("./Datasets/paper/") + (is_base ? "paper_base" : "paper_query") + ".fvecs"; + } else { + std::cerr << "Invalid datset in get_file_name" << std::endl; + return ""; + } +} + +// return name is in arg file_path +void get_index_name(int N, int n_centroids, std::string assignment_type, float alpha, int M_beta, std::string& file_path) { + std::stringstream filepath_stream; + filepath_stream << "./tmp/hybrid_" << (int) (N / 1000 / 1000) << "m_nc=" << n_centroids << "_assignment=" << assignment_type << "_alpha=" << alpha << "Mb=" << M_beta << ".json"; + // copy filepath_stream to file_path + file_path = filepath_stream.str(); +} + + + + + +/******************************************************* + * Added for debugging + *******************************************************/ +const int debugFlag = 1; + +void debugTime() { + if (debugFlag) { + struct timeval tval; + gettimeofday(&tval, NULL); + struct tm *tm_info = localtime(&tval.tv_sec); + char timeBuff[25] = ""; + strftime(timeBuff, 25, "%H:%M:%S", tm_info); + char timeBuffWithMilli[50] = ""; + sprintf(timeBuffWithMilli, "%s.%06ld ", timeBuff, tval.tv_usec); + std::string timestamp(timeBuffWithMilli); + std::cout << timestamp << std::flush; + } +} + +//needs atleast 2 args always +// alt debugFlag = 1 // fprintf(stderr, fmt, __VA_ARGS__); +#define debug(fmt, ...) \ + do { \ + if (debugFlag == 1) { \ + fprintf(stdout, "--" fmt, __VA_ARGS__);\ + } \ + if (debugFlag == 2) { \ + debugTime(); \ + fprintf(stdout, "%s:%d:%s(): " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); \ + } \ + } while (0) + + + +double elapsed() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + +/******************************************************* + * performance testing helpers + *******************************************************/ +std::pair get_mean_and_std(std::vector& times) { + // compute mean + float total = 0; + // for (int num: times) { + for (int i=0; i < times.size(); i++) { + // printf("%f, ", times[i]); // for debugging + total = total + times[i]; + } + float mean = (total / times.size()); + + // compute stdev from variance, using computed mean + float result = 0; + for (int i=0; i < times.size(); i++) { + result = result + (times[i] - mean)*(times[i] - mean); + } + float variance = result / (times.size() - 1); + // for debugging + // printf("variance: %f\n", variance); + + float std = std::sqrt(variance); + + // return + return std::make_pair(mean, std); +} + + + + +// ground truth labels @gt, results to evaluate @I with @nq queries, returns @gt_size-Recall@k where gt had max gt_size NN's per query +float compute_recall(std::vector& gt, int gt_size, std::vector& I, int nq, int k, int gamma=1) { + // printf("compute_recall params: gt.size(): %ld, gt_size: %d, I.size(): %ld, nq: %d, k: %d, gamma: %d\n", gt.size(), gt_size, I.size(), nq, k, gamma); + + int n_1 = 0, n_10 = 0, n_100 = 0; + for (int i = 0; i < nq; i++) { // loop over all queries + // int gt_nn = gt[i * k]; + std::vector::const_iterator first = gt.begin() + i*gt_size; + std::vector::const_iterator last = gt.begin() + i*gt_size + (k / gamma); + std::vector gt_nns_tmp(first, last); + // if (gt_nns_tmp.size() > 10) { + // printf("gt_nns size: %ld\n", gt_nns_tmp.size()); + // } + + // gt_nns_tmp.resize(k); // truncate if gt_size > k + std::set gt_nns(gt_nns_tmp.begin(), gt_nns_tmp.end()); + // if 
+
+// ground truth labels @gt, results to evaluate @I with @nq queries; returns
+// Recall@k, where gt holds up to gt_size NN ids per query
+float compute_recall(std::vector<faiss::idx_t>& gt, int gt_size, std::vector<faiss::idx_t>& I, int nq, int k, int gamma = 1) {
+    int n_1 = 0, n_10 = 0, n_100 = 0;
+    for (int i = 0; i < nq; i++) { // loop over all queries
+        std::vector<faiss::idx_t>::const_iterator first = gt.begin() + i * gt_size;
+        std::vector<faiss::idx_t>::const_iterator last = gt.begin() + i * gt_size + (k / gamma);
+        std::vector<faiss::idx_t> gt_nns_tmp(first, last);
+        // gt_nns_tmp.resize(k); // truncate if gt_size > k
+        std::set<faiss::idx_t> gt_nns(gt_nns_tmp.begin(), gt_nns_tmp.end());
+
+        for (int j = 0; j < k; j++) { // iterate over returned nn results
+            if (gt_nns.count(I[i * k + j]) != 0) {
+                if (j < 1 * gamma)
+                    n_1++;
+                if (j < 10 * gamma)
+                    n_10++;
+                if (j < 100 * gamma)
+                    n_100++;
+            }
+        }
+    }
+    return n_10 / float(nq);
+}
+
+template <typename T>
+void log_values(std::string annotation, std::vector<T>& values) {
+    std::cout << annotation;
+    for (int i = 0; i < values.size(); i++) {
+        std::cout << values[i];
+        if (i < values.size() - 1) {
+            std::cout << ", ";
+        }
+    }
+    std::cout << std::endl;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// FOR CORRELATION TESTING
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template <typename T>
+std::vector<T> load_json_to_vector(std::string filepath) {
+    // Open the JSON file
+    std::ifstream file(filepath);
+    if (!file.is_open()) {
+        std::cerr << "Failed to open JSON file " << filepath << std::endl;
+    }
+
+    // Parse the JSON data
+    json data;
+    try {
+        file >> data;
+    } catch (const std::exception& e) {
+        std::cerr << "Failed to parse JSON data from " << filepath << ": " << e.what() << std::endl;
+    }
+
+    // Convert data to a vector
+    std::vector<T> v = data.get<std::vector<T>>();
+
+    // print size
+    std::cout << "metadata or vector loaded from json, size: " << v.size() << std::endl;
+    return v;
+}
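+
+// The attribute, query-filter, and ground-truth files parsed by the loaders
+// below are flat JSON arrays. A file of that shape could be produced with
+// nlohmann::json (sketch, hypothetical values):
+//
+//   json j = std::vector<int>{3, 7, 1, 9};
+//   std::ofstream out(std::string(TESTING_DATA_DIR) + "/sift_attr.json");
+//   out << j; // writes [3,7,1,9]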
+
+std::vector<int> load_aq(std::string dataset, int n_centroids, int alpha, int N) {
+    if (dataset == "sift1M" || dataset == "sift1B") {
+        assert((alpha == -2 || alpha == 0 || alpha == 2) || !"alpha must be a value in [-2, 0, 2]");
+
+        // Compose File Name
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/query_filters_sift" << (int)(N / 1000 / 1000)
+                        << "m_nc=" << n_centroids << "_alpha=" << alpha << ".json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded query attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "tripclick") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/query_filters_tripclick_sample_subset_min100.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded query attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "sift1M_test" || dataset == "paper") {
+        // return a vector of N query filters, all with attribute value 5
+        std::vector<int> v(N, 5);
+        printf("made query filters with value %d, length %ld\n", v[0], v.size());
+        return v;
+    } else if (dataset == "paper_rand2m") {
+        // Compose File Name
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/query_filters_paper_rand2m_nc=12_alpha=0.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded query attributes from: %s\n", filepath.c_str());
+        return v;
+    } else {
+        std::cerr << "Invalid dataset in load_aq" << std::endl;
+        return std::vector<int>();
+    }
+}
+
+// assignment_type can be "rand", "soft", "soft_squared", "hard"
+std::vector<int> load_ab(std::string dataset, int n_centroids, std::string assignment_type, int N) {
+    // Compose File Name
+    if (dataset == "sift1M" || dataset == "sift1B") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/base_attrs_sift" << (int)(N / 1000 / 1000)
+                        << "m_nc=" << n_centroids << "_assignment=" << assignment_type << ".json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded base attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "sift1M_test") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/sift_attr" << ".json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded base attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "paper") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/paper_attr.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded base attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "paper_rand2m") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/base_attrs_paper_rand2m_nc=12_assignment=rand.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded base attributes from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "tripclick") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/base_attrs_tripclick.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v = load_json_to_vector<int>(filepath);
+        printf("loaded base attributes from: %s\n", filepath.c_str());
+        return v;
+    } else {
+        std::cerr << "Invalid dataset in load_ab" << std::endl;
+        return std::vector<int>();
+    }
+}
+
+// assignment_type can be "rand", "soft", "soft_squared", "hard"
+// alpha can be -2, 0, 2
+std::vector<faiss::idx_t> load_gt(std::string dataset, int n_centroids, int alpha, std::string assignment_type, int N) {
+    if (dataset == "sift1M" || dataset == "sift1B") {
+        // Compose File Name
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/gt_sift" << (int)(N / 1000 / 1000)
+                        << "m_nc=" << n_centroids << "_assignment=" << assignment_type
+                        << "_alpha=" << alpha << ".json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v_tmp = load_json_to_vector<int>(filepath);
+        std::vector<faiss::idx_t> v(v_tmp.begin(), v_tmp.end());
+        printf("loaded gt from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "sift1M_test") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/sift_gt_5.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v_tmp = load_json_to_vector<int>(filepath);
+        std::vector<faiss::idx_t> v(v_tmp.begin(), v_tmp.end());
+        printf("loaded gt from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "paper") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/paper_gt_5.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v_tmp = load_json_to_vector<int>(filepath);
+        std::vector<faiss::idx_t> v(v_tmp.begin(), v_tmp.end());
+        printf("loaded gt from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "paper_rand2m") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/gt_paper_rand2m_nc=12_assignment=rand_alpha=0.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v_tmp = load_json_to_vector<int>(filepath);
+        std::vector<faiss::idx_t> v(v_tmp.begin(), v_tmp.end());
+        printf("loaded gt from: %s\n", filepath.c_str());
+        return v;
+    } else if (dataset == "tripclick") {
+        std::stringstream filepath_stream;
+        filepath_stream << TESTING_DATA_DIR << "/gt_tripclick_sample_subset_min100.json";
+        std::string filepath = filepath_stream.str();
+
+        std::vector<int> v_tmp = load_json_to_vector<int>(filepath);
+        std::vector<faiss::idx_t> v(v_tmp.begin(), v_tmp.end());
+        printf("loaded gt from: %s\n", filepath.c_str());
+        return v;
+    } else {
+        std::cerr << "Invalid dataset in load_gt" << std::endl;
+        return std::vector<faiss::idx_t>();
+    }
+}