Adding more private_query functionality and fixes (FALCONN-LIB#87)
* added some private query sets
* added paths to heldout queries and GT100 for all datasets except msspacev
* added spacev private queries
* updates
* isolating pqs changes for merge
* need to update main with the baseline config and algo from T3 for testing
* fix issue with failed extraction of search times
* using hyphens in cmd flags
* undoing/commenting the deep1b incorrect header HACK

* adding a priv run script for testing

* removed debug print

Co-authored-by: Harsha Vardhan Simhadri <[email protected]>
sourcesync and harsha-simhadri authored Dec 4, 2021
1 parent 0c28a7e commit 343d48c
Showing 8 changed files with 192 additions and 56 deletions.
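
For orientation before the per-file diffs: the commit threads one new CLI flag, --private-query, from the top-level driver (benchmark/main.py) through the runner and into the metrics/export code. A minimal end-to-end sketch, assuming the repository's usual run.py entry point; the dataset and algorithm names are example values:

    # Benchmark against the held-out (private) query set instead of the public one.
    # "deep-1B" and "faiss-t3" are illustrative; --private-query is the flag added here.
    python run.py --dataset deep-1B --algorithm faiss-t3 --private-query
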
6 changes: 5 additions & 1 deletion benchmark/algorithms/faiss_t3.py
@@ -232,6 +232,10 @@ def produce_batches(self, x, bs):


def search(self, x, k):

if x.dtype != np.float32: #GW - why do we need this now?
    x = x.astype(np.float32)

bs = self.search_bs
if self.vec_transform:
x = self.vec_transform(x)
@@ -257,7 +261,7 @@ def search(self, x, k):

def range_search(self, x, radius):

x = x.astype( np.float32 ) #GW
x = x.astype( np.float32 ) #GW - why do we need this now?

bs = self.search_bs
if self.vec_transform:
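A likely answer to the "#GW" questions above: faiss's Python search API only accepts contiguous float32 matrices, while the private query files for the integer datasets (e.g. bigann's .u8bin) load as uint8. A minimal self-contained sketch of the failure mode, assuming faiss and numpy are installed:

    import numpy as np
    import faiss

    index = faiss.IndexFlatL2(4)
    index.add(np.random.rand(10, 4).astype(np.float32))

    q = np.zeros((1, 4), dtype=np.uint8)   # dtype of a .u8bin query file
    # index.search(q, 3) would raise; casting first, as search() above does, works:
    D, I = index.search(q.astype(np.float32), 3)
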
92 changes: 90 additions & 2 deletions benchmark/datasets.py
@@ -94,6 +94,13 @@ def ivecs_read(fname):
def xbin_mmap(fname, dtype, maxn=-1):
""" mmap the competition file format for a given type of items """
n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))

# HACK - to handle improper header in file for private deep-1B
# if override_d and override_d != d:
#     print("Warning: xbin_mmap returned d=%d, but overriding with %d" % (d, override_d))
#     d = override_d
# HACK

assert os.stat(fname).st_size == 8 + n * d * np.dtype(dtype).itemsize
if maxn > 0:
n = min(n, maxn)
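
For reference, the layout the size assert above checks is an 8-byte header of two uint32 values (n, d) followed by n*d items of the payload dtype. A small self-contained sketch of writing and mmapping a toy file in this format (file name illustrative):

    import os
    import numpy as np

    n, d = 5, 4
    data = np.arange(n * d, dtype="uint8").reshape(n, d)
    with open("toy.u8bin", "wb") as f:
        np.array([n, d], dtype="uint32").tofile(f)   # 8-byte header
        data.tofile(f)

    # The same size check xbin_mmap performs:
    assert os.stat("toy.u8bin").st_size == 8 + n * d * np.dtype("uint8").itemsize

    # Map the payload, skipping the header, as xbin_mmap does:
    x = np.memmap("toy.u8bin", dtype="uint8", mode="r", offset=8, shape=(n, d))
    assert (x == data).all()
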
@@ -188,6 +195,11 @@ def get_queries(self):
Return (nq, d) array containing the nq queries.
"""
pass
def get_private_queries(self):
"""
Return (private_nq, d) array containing the private_nq private queries.
"""
pass
def get_groundtruth(self, k=None):
"""
Return (nq, k) array containing groundtruth indices
@@ -254,6 +266,22 @@ def prepare(self, skip_data=False):
continue
download(sourceurl, outfile)

# private qs url
if self.private_qs_url:
outfile = os.path.join(self.basedir, self.private_qs_url.split("/")[-1])
if os.path.exists(outfile):
print("file %s already exists" % outfile)
else:
download(self.private_qs_url, outfile)

# private gt url
if self.private_gt_url:
outfile = os.path.join(self.basedir, self.private_gt_url.split("/")[-1])
if os.path.exists(outfile):
print("file %s already exists" % outfile)
else:
download(self.private_gt_url, outfile)

if skip_data:
return
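
Because the two download blocks above run before the skip_data early return, a dataset can fetch just the small files, private sets included; a sketch with an illustrative dataset key:

    from benchmark.datasets import DATASETS

    ds = DATASETS["bigann-1B"]()   # example key
    ds.prepare(skip_data=True)     # fetches queries, GT, and the new private files, not the base vectors
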

@@ -325,6 +353,27 @@ def get_queries(self):
assert x.shape == (self.nq, self.d)
return sanitize(x)

def get_private_queries(self):
assert self.private_qs_url is not None
fn = self.private_qs_url.split("/")[-1] # in case it's a URL
filename = os.path.join(self.basedir, fn)
x = xbin_mmap(filename, dtype=self.dtype)
assert x.shape == (self.private_nq, self.d)
return sanitize(x)

def get_private_groundtruth(self, k=None):
assert self.private_gt_url is not None
fn = self.private_gt_url.split("/")[-1] # in case it's a URL
assert self.search_type() == "knn"

I, D = knn_result_read(os.path.join(self.basedir, fn))
assert I.shape[0] == self.private_nq
if k is not None:
assert k <= 100
I = I[:, :k]
D = D[:, :k]
return I, D
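
The two new accessors pair up for scoring; a rough sketch, assuming a prepared knn dataset ("bigann-1B" is an example key, and I stands in for an algorithm's result matrix):

    import numpy as np
    from benchmark.datasets import DATASETS

    ds = DATASETS["bigann-1B"]()            # example key
    Xq = ds.get_private_queries()           # (private_nq, d)
    gt_I, gt_D = ds.get_private_groundtruth(k=10)

    # Given result ids I of shape (private_nq, 10), a crude recall@10
    # (exact id match, ignoring distance ties):
    # recall = (np.sort(I) == np.sort(gt_I)).mean()
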

subset_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/"

class SSNPPDataset(DatasetCompetitionFormat):
@@ -337,7 +386,6 @@ def __init__(self, nb_M=1000):
self.dtype = "uint8"
self.ds_fn = "FB_ssnpp_database.u8bin"
self.qs_fn = "FB_ssnpp_public_queries.u8bin"

self.gt_fn = (
"FB_ssnpp_public_queries_1B_GT.rangeres" if self.nb_M == 1000 else
subset_url + "GT_100M/ssnpp-100M" if self.nb_M == 100 else
@@ -348,6 +396,10 @@ def __init__(self, nb_M=1000):
self.base_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/"
self.basedir = os.path.join(BASEDIR, "FB_ssnpp")

self.private_nq = 100000
self.private_qs_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/FB_ssnpp_heldout_queries_3307fba121460a56.u8bin"
self.private_gt_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/GT_1B_final_2bf4748c7817/FB_ssnpp.bin"

def search_type(self):
return "range"

@@ -362,6 +414,12 @@ def get_groundtruth(self, k=None):
assert self.gt_fn is not None
fn = self.gt_fn.split("/")[-1] # in case it's a URL
return range_result_read(os.path.join(self.basedir, fn))

def get_private_groundtruth(self, k=None):
""" override the ground-truth function as this is the only range search dataset """
assert self.private_gt_url is not None
fn = self.private_gt_url.split("/")[-1] # in case it's a URL
return range_result_read(os.path.join(self.basedir, fn))
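
SSNPP overrides the accessor because range ground truth is not a fixed-k matrix. A sketch of consuming it, assuming range_result_read returns per-query result counts plus a flat id array:

    import numpy as np
    from benchmark.datasets import DATASETS

    ds = DATASETS["ssnpp-1B"]()                # example key
    nres, ids = ds.get_private_groundtruth()   # assumed (counts, flat ids) convention
    lims = np.zeros(len(nres) + 1, dtype="int64")
    lims[1:] = np.cumsum(nres)                 # CSR-style offsets
    ids_q0 = ids[lims[0]:lims[1]]              # matches for the first private query
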

class BigANNDataset(DatasetCompetitionFormat):
def __init__(self, nb_M=1000):
@@ -381,6 +439,11 @@ def __init__(self, nb_M=1000):
# self.gt_fn = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/bigann/public_query_gt100.bin" if self.nb == 10**9 else None
self.base_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/bigann/"
self.basedir = os.path.join(BASEDIR, "bigann")

self.private_nq = 10000
self.private_qs_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/bigann/query.private.799253207.10K.u8bin"
self.private_gt_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/GT_1B_final_2bf4748c7817/bigann-1B.bin"


def distance(self):
return "euclidean"
@@ -403,12 +466,18 @@ def __init__(self, nb_M=1000):
self.base_url = "https://storage.yandexcloud.net/yandex-research/ann-datasets/DEEP/"
self.basedir = os.path.join(BASEDIR, "deep1b")

self.private_nq = 30000
self.private_qs_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/deep1b/query.heldout.30K.fbin"
self.private_gt_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/deep1b/gt100-heldout.30K.fbin"

self.private_nq_large = 1000000
self.private_qs_large_url = "https://storage.yandexcloud.net/yr-secret-share/ann-datasets-5ac0659e27/DEEP/query.private.1M.fbin"

def distance(self):
return "euclidean"




class Text2Image1B(DatasetCompetitionFormat):
def __init__(self, nb_M=1000):
self.nb_M = nb_M
@@ -427,6 +496,13 @@ def __init__(self, nb_M=1000):
self.base_url = "https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/"
self.basedir = os.path.join(BASEDIR, "text2image1B")

self.private_nq = 30000
self.private_qs_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/text2image1b/query.heldout.30K.fbin"
self.private_gt_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/text2image1b/gt100-heldout.30K.fbin"

self.private_nq_large = 1000000
self.private_qs_large_url = "https://storage.yandexcloud.net/yr-secret-share/ann-datasets-5ac0659e27/T2I/query.private.1M.fbin"

def distance(self):
return "ip"

@@ -454,6 +530,14 @@ def __init__(self, nb_M=1000):
self.base_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/"
self.basedir = os.path.join(BASEDIR, "MSTuringANNS")

self.private_nq = 10000
self.private_qs_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/testQuery10K.fbin"
self.private_gt_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/gt100-private10K-queries.bin"

self.private_nq_large = 99605
self.private_qs_large_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/testQuery99605.fbin"
self.private_gt_large_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/gt100-private99605-queries.bin"

def distance(self):
return "euclidean"

@@ -476,6 +560,10 @@ def __init__(self, nb_M=1000):
self.base_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/spacev1b/"
self.basedir = os.path.join(BASEDIR, "MSSPACEV1B")

self.private_nq = 30000
self.private_qs_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/spacev1b/private_query_30k.bin"
self.private_gt_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/spacev1b/public_query_gt100.bin"

def distance(self):
return "euclidean"

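All six datasets follow the same pattern, so a new dataset opts into private queries by declaring three attributes that prepare() and the accessors above pick up; a sketch with placeholder values:

    class MyDataset(DatasetCompetitionFormat):
        def __init__(self):
            # ... existing nb/nq/dtype/URL setup ...
            # Held-out query support; all three values are placeholders.
            self.private_nq = 10000
            self.private_qs_url = "https://example.com/my.heldout.10K.fbin"
            self.private_gt_url = "https://example.com/my.gt100-heldout.10K.bin"
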
16 changes: 12 additions & 4 deletions benchmark/main.py
@@ -34,6 +34,7 @@ def positive_int(s):


def run_worker(args, queue):
print("RW", args)
while not queue.empty():
definition = queue.get()
memory_margin = 500e6 # reserve some extra memory for misc stuff
@@ -46,14 +47,16 @@ def run_worker(args, queue):
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)
args.blob_prefix, args.sas_string,
args.private_query)

else:
run_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
args.upload_index, args.download_index,
args.blob_prefix, args.sas_string)
args.blob_prefix, args.sas_string,
args.private_query)


def main():
@@ -135,7 +138,7 @@ def main():
parser.add_argument(
'--download-index',
help='Download index uploaded to Azure blob storage and run local queries.',
action='store_true')
parser.add_argument(
'--blob-prefix',
help='Azure blob prefix to upload indices to and download indices from.'
@@ -144,8 +147,13 @@ def main():
'--sas-string',
help='SAS string to authenticate to Azure blob storage.'
)
parser.add_argument(
'--private-query',
help='Use the new set of private queries that were not released during the competition period.',
action='store_true'
)



args = parser.parse_args()
if args.timeout == -1:
args.timeout = None
14 changes: 11 additions & 3 deletions benchmark/plotting/utils.py
@@ -4,6 +4,8 @@
import numpy
from benchmark.plotting.metrics import all_metrics as metrics
from benchmark.sensors.power_capture import power_capture
import traceback
import sys

def get_or_create_metrics(run):
if 'metrics' not in run:
@@ -67,12 +69,18 @@ def compute_metrics(true_nn, res, metric_1, metric_2,

return all_results

def compute_metrics_all_runs(dataset, res, recompute=False, \
sensor_metrics=False, search_times=False):
def compute_metrics_all_runs(dataset, res, recompute=False,
sensor_metrics=False, search_times=False,
private_query=False):

try:
true_nn = dataset.get_groundtruth()
if private_query:
true_nn = dataset.get_private_groundtruth()
else:
true_nn = dataset.get_groundtruth()
except:
print(f"Groundtruth for {dataset} not found.")
#traceback.print_exc()
return

search_type = dataset.search_type()
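A sketch of the new call path, mirroring the data_export.py change further down; the dataset key is illustrative and the loader's module path is an assumption:

    from benchmark.datasets import DATASETS
    from benchmark.plotting.utils import compute_metrics_all_runs
    from benchmark.results import load_all_results   # assumed module path; data_export.py uses the same helper

    ds = DATASETS["deep-1B"]()            # example key
    res = load_all_results("deep-1B")     # as in data_export.py below
    # Metrics are now scored against the private ground truth:
    rows = compute_metrics_all_runs(ds, res, private_query=True)
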
25 changes: 18 additions & 7 deletions benchmark/runner.py
@@ -57,8 +57,8 @@ def run_individual_query(algo, X, distance, count, run_count, search_type):

def run(definition, dataset, count, run_count, rebuild,
upload_index=False, download_index=False,
blob_prefix="", sas_string=""):
blob_prefix="", sas_string="", private_query=False):

algo = instantiate_algorithm(definition)
assert not definition.query_argument_groups \
or hasattr(algo, "set_query_arguments"), """\
@@ -70,7 +70,10 @@ def run(definition, dataset, count, run_count, rebuild,

ds = DATASETS[dataset]()
#X_train = numpy.array(D['train'])
X = ds.get_queries()
if not private_query:
X = ds.get_queries()
else:
X = ds.get_private_queries()
distance = ds.distance()
search_type = ds.search_type()
print(f"Running {definition.algorithm} on {dataset}")
@@ -206,7 +209,10 @@ def run_from_cmdline(args=None):
parser.add_argument(
'--sas-string',
help='SAS string to authenticate to Azure blob storage.')

parser.add_argument(
'--private-query',
help='Use the new set of private queries that were not released during the competition period.',
action="store_true")

args = parser.parse_args(args)
algo_args = json.loads(args.build)
@@ -227,13 +233,14 @@ def run_from_cmdline(args=None):
disabled=False
)
run(definition, args.dataset, args.count, args.runs, args.rebuild,
args.upload_index, args.download_index, args.blob_prefix, args.sas_string)
args.upload_index, args.download_index, args.blob_prefix, args.sas_string,
args.private_query)


def run_docker(definition, dataset, count, runs, timeout, rebuild,
cpu_limit, mem_limit=None, t3=None, power_capture=None,
upload_index=False, download_index=False,
blob_prefix="", sas_string=""):
blob_prefix="", sas_string="", private_query=False):
cmd = ['--dataset', dataset,
'--algorithm', definition.algorithm,
'--module', definition.module,
@@ -252,6 +259,8 @@ def run_docker(definition, dataset, count, runs, timeout, rebuild,
cmd.append("--download-index")
cmd += ["--blob-prefix", blob_prefix]
cmd += ["--sas-string", sas_string]
if private_query:
cmd.append("--private-query")

cmd.append(json.dumps(definition.arguments))
cmd += [json.dumps(qag) for qag in definition.query_argument_groups]
Expand Down Expand Up @@ -313,7 +322,7 @@ def stream_logs():
def run_no_docker(definition, dataset, count, runs, timeout, rebuild,
cpu_limit, mem_limit=None, t3=False, power_capture=None,
upload_index=False, download_index=False,
blob_prefix="", sas_string=""):
blob_prefix="", sas_string="", private_query=False):
cmd = ['--dataset', dataset,
'--algorithm', definition.algorithm,
'--module', definition.module,
@@ -332,6 +341,8 @@ def run_no_docker(definition, dataset, count, runs, timeout, rebuild,
cmd.append("--download-index")
cmd += ["--blob-prefix", blob_prefix]
cmd += ["--sas-string", sas_string]
if private_query:
cmd.append("--private-query")

cmd.append(json.dumps(definition.arguments))
cmd += [json.dumps(qag) for qag in definition.query_argument_groups]
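Both launchers build the same child argument vector; when private_query is set it ends up looking like this sketch (all values illustrative, and only flags visible in this diff are guaranteed):

    cmd = ['--dataset', 'deep-1B',
           '--algorithm', 'faiss-t3',
           '--module', 'benchmark.algorithms.faiss_t3',
           '--blob-prefix', '', '--sas-string', '',
           '--private-query',               # appended by the new branch above
           '{"some_build_arg": 1}']         # json.dumps(definition.arguments)
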
11 changes: 8 additions & 3 deletions data_export.py
@@ -23,15 +23,20 @@
action='store_true',
help='Path to the output csv file')
parser.add_argument(
'--private-query',
help='Use the private queries and ground truth',
action='store_true')
parser.add_argument(
'--sensors',
action='store_true',
help='Export sensors data if available')
parser.add_argument(
'--search_times',
'--search-times',
action='store_true',
help='Export search times data if available')
parser.add_argument(
'--detect_caching',
'--detect-caching',
type=float,
default=None,
metavar="THRESHOLD",
@@ -51,7 +56,7 @@
dataset = DATASETS[dataset_name]()
results = load_all_results(dataset_name)
results = compute_metrics_all_runs(dataset, results, args.recompute, \
args.sensors, args.search_times)
args.sensors, args.search_times, args.private_query)
cleaned = []
for result in results:
if 'k-nn' in result:
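With the hyphenated flags (one of the commit's stated fixes) and the new option, a typical export against the private ground truth might look like this (output path illustrative):

    python data_export.py --output res_private.csv --private-query --search-times
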
