benchs

guestrin-lab · Jun 11, 2024 · b30fdc8 · b30fdc8
1 parent 63c0212
commit b30fdc8
Show file tree

Hide file tree

Showing 48 changed files with 9,838 additions and 0 deletions.
diff --git a/benchs/CMakeLists.txt b/benchs/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Benchmark executable built from bench_ivf_selector.cpp. EXCLUDE_FROM_ALL
+# keeps it out of the default "all" target, so it must be built explicitly.
+add_executable(bench_ivf_selector EXCLUDE_FROM_ALL bench_ivf_selector.cpp)
+target_link_libraries(bench_ivf_selector PRIVATE faiss)
+
diff --git a/benchs/README.md b/benchs/README.md
diff --git a/benchs/bench_6bit_codec.cpp b/benchs/bench_6bit_codec.cpp
@@ -0,0 +1,81 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <omp.h>
+#include <cstdio>
+
+#include <benchmark/benchmark.h>
+#include <faiss/impl/ScalarQuantizer.h>
+#include <faiss/utils/distances.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/utils.h>
+
+using namespace faiss;
+
+// Trains a 6-bit ScalarQuantizer on random vectors, publishes round-trip
+// counters, then times all-pairs distances via its distance computer.
+static void bench(benchmark::State& state) {
+    const int dim = 128;
+    const int num_vecs = 2000;
+    const int num_floats = dim * num_vecs;
+
+    // input vectors filled by float_rand (12345 is presumably the seed)
+    std::vector<float> input(num_floats);
+    float_rand(input.data(), num_floats, 12345);
+
+    ScalarQuantizer quantizer(dim, ScalarQuantizer::QT_6bit);
+    // request a single OpenMP thread before training
+    omp_set_num_threads(1);
+    quantizer.train(num_vecs, input.data());
+
+    const size_t code_size = quantizer.code_size;
+    state.counters["code_size"] = quantizer.code_size;
+
+    // first pass: encode the input, then decode it back to floats
+    std::vector<uint8_t> first_codes(code_size * num_vecs);
+    quantizer.compute_codes(input.data(), first_codes.data(), num_vecs);
+    std::vector<float> decoded(num_floats);
+    quantizer.decode(first_codes.data(), decoded.data(), num_vecs);
+
+    const auto total_sq_err =
+            fvec_L2sqr(input.data(), decoded.data(), num_floats);
+    state.counters["sql2_recons_error"] = total_sq_err / num_vecs;
+
+    // second pass: re-encode the decoded vectors to check idempotence
+    std::vector<uint8_t> second_codes(code_size * num_vecs);
+    quantizer.compute_codes(decoded.data(), second_codes.data(), num_vecs);
+
+    // count the code bytes that changed between the two passes
+    size_t num_changed = 0;
+    for (size_t pos = 0; pos != first_codes.size(); ++pos) {
+        num_changed += (first_codes[pos] != second_codes[pos]) ? 1 : 0;
+    }
+    state.counters["ndiff_for_idempotence"] = num_changed;
+    state.counters["code_size_two"] = first_codes.size();
+
+    // distance computer reading from the first set of codes
+    std::unique_ptr<ScalarQuantizer::SQDistanceComputer> computer(
+            quantizer.get_distance_computer());
+    computer->codes = first_codes.data();
+    computer->code_size = quantizer.code_size;
+    state.counters["code_size_three"] = computer->code_size;
+
+    // timed section: each vector is a query against every stored code
+    for (auto _ : state) {
+        float sum_dis = 0;
+        for (int query = 0; query < num_vecs; ++query) {
+            computer->set_query(input.data() + query * dim);
+            for (int target = 0; target < num_vecs; ++target) {
+                benchmark::DoNotOptimize(sum_dis += (*computer)(target));
+            }
+        }
+    }
+}
+// NOTE: n and d should arguably be benchmark input arguments rather than
+// constants hard-coded in bench() for the results to really make sense.
+BENCHMARK(bench)->Iterations(20);
+BENCHMARK_MAIN();
diff --git a/benchs/bench_all_ivf/README.md b/benchs/bench_all_ivf/README.md
@@ -0,0 +1,20 @@
+# Benchmark of IVF variants
+
+This is a benchmark of IVF index variants, looking at compression vs. speed vs. accuracy. 
+The results are in [this wiki chapter](https://github.com/facebookresearch/faiss/wiki/Indexing-1G-vectors)
+
+
+The code is organized as follows:
+
+- `datasets.py`: code to access the datafiles, compute the ground-truth and report accuracies
+
+- `bench_all_ivf.py`: evaluate one type of inverted file
+
+- `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices. 
+Since the number of experiments is quite large, the script is structured so that the benchmark can be run on a cluster.
+
+- `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results. 
+
+The code depends on Faiss and can use 1 to 8 GPUs to do the k-means clustering for large vocabularies. 
+
+It was run in October 2018 for the results in the wiki.