From 9a390237d2c7ecf28d9414afe98c3a62030eabbe Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 13 Feb 2022 18:50:17 +0200 Subject: [PATCH 01/77] added support for other algorithms --- ann_benchmarks/algorithms/definitions.py | 2 +- ann_benchmarks/main.py | 4 +-- ann_benchmarks/results.py | 14 +++++------ multirun.py | 32 ++++++++++++++++-------- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py index 5c4ed31da..3b9c4300b 100644 --- a/ann_benchmarks/algorithms/definitions.py +++ b/ann_benchmarks/algorithms/definitions.py @@ -97,7 +97,7 @@ def get_unique_algorithms(definition_file): def get_definitions(definition_file, dimension, point_type="float", - distance_metric="euclidean", count=10, conn_params=dict()): + distance_metric="euclidean", count=10, conn_params={'host': None, 'port': None, 'auth': None, 'user': None, 'cluster': False}): definitions = _get_definitions(definition_file) algorithm_definitions = {} diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py index 744dde61c..ed0fd3957 100644 --- a/ann_benchmarks/main.py +++ b/ann_benchmarks/main.py @@ -139,12 +139,12 @@ def main(): '--host', metavar='NAME', help='host name or IP', - default="localhost") + default=None) parser.add_argument( '--port', type=positive_int, help='the port "host" is listening on', - default=6379) + default=None) parser.add_argument( '--auth', '-a', metavar='PASSWORD', diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py index c1306c927..7f679a0af 100644 --- a/ann_benchmarks/results.py +++ b/ann_benchmarks/results.py @@ -16,13 +16,13 @@ def get_result_filename(dataset=None, count=None, definition=None, d.append(str(count)) if definition: d.append(definition.algorithm + ('-batch' if batch_mode else '')) - if 'redisearch' in definition.algorithm: - prefix = re.sub(r'\W+', '_', json.dumps(query_arguments, sort_keys=True)).strip('_') - d.append(prefix + f'_client_{id}.hdf5') - else: - data = definition.arguments + query_arguments - d.append(re.sub(r'\W+', '_', json.dumps(data, sort_keys=True)) - .strip('_') + ".hdf5") + data = definition.arguments + query_arguments + for i in range(len(data)): + if isinstance(data[i], dict): + data[i] = {k:data[i][k] for k in data[i] if data[i][k] is not None} + data.append('client') + data.append(id) + d.append(re.sub(r'\W+', '_', json.dumps(data, sort_keys=True)).strip('_') + ".hdf5") return os.path.join(*d) diff --git a/multirun.py b/multirun.py index 24795b1e4..07c62c3db 100644 --- a/multirun.py +++ b/multirun.py @@ -24,12 +24,12 @@ parser.add_argument( '--host', help='host name or IP', - default='localhost') + default=None) parser.add_argument( '--port', type=positive_int, help='the port "host" is listening on', - default=6379) + default=None) parser.add_argument( '--auth', '-a', metavar='PASS', @@ -60,20 +60,28 @@ '--algorithm', metavar='ALGO', help='run redisearch with this algorithm', - default="hnsw") + default="redisearch-hnsw") parser.add_argument( '--cluster', action='store_true', help='working with a cluster') args = parser.parse_args() + isredis = True if 'redisearch' in args.algorithm else False - redis = RedisCluster if args.cluster else Redis - redis = redis(host=args.host, port=args.port, password=args.auth, username=args.user) + if isredis: + redis = RedisCluster if args.cluster else Redis + redis = redis(host=args.host, port=args.port, password=args.auth, username=args.user) + elif 'milvus' in args.algorithm: + if args.build_clients > 1: + 
print('milvus does not allow multi client build. running with one builder') + args.build_clients = 1 - base = 'python run.py --local --algorithm redisearch-' + args.algorithm.lower() + ' -k ' + str(args.count) + \ - ' --dataset ' + args.dataset + ' --host ' + str(args.host) + ' --port ' + str(args.port) + base = 'python run.py --local --algorithm ' + args.algorithm + ' -k ' + str(args.count) + \ + ' --dataset ' + args.dataset + if args.host: base += ' --host ' + str(args.host) + if args.port: base += ' --port ' + str(args.port) if args.user: base += ' --user ' + str(args.user) if args.auth: base += ' --auth ' + str(args.auth) if args.force: base += ' --force' @@ -92,15 +100,17 @@ print(f'total build time: {total_time}\n\n') fn = get_result_filename(args.dataset, args.count) + fn = path.join(fn, args.algorithm) if not path.isdir(fn): makedirs(fn) fn = path.join(fn, 'build_stats.hdf5') f = h5py.File(fn, 'w') f.attrs["build_time"] = total_time - if args.cluster: - f.attrs["index_size"] = -1 # TODO: get total size from all the shards - else: - f.attrs["index_size"] = redis.ft('ann_benchmark').info()['vector_index_sz_mb']*0x100000 + if isredis: + if args.cluster: + f.attrs["index_size"] = -1 # TODO: get total size from all the shards + else: + f.attrs["index_size"] = redis.ft('ann_benchmark').info()['vector_index_sz_mb']*0x100000 f.close() if args.test_clients > 0: From d565635a84362436f695bded71d319283f399e83 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 13 Feb 2022 18:50:50 +0200 Subject: [PATCH 02/77] re-write milvus.py file for thier new API --- algos.yaml | 15 +++++-- ann_benchmarks/algorithms/milvus.py | 66 ++++++++++++++++++++--------- 2 files changed, 58 insertions(+), 23 deletions(-) diff --git a/algos.yaml b/algos.yaml index de1071504..26bc1d3c9 100644 --- a/algos.yaml +++ b/algos.yaml @@ -240,14 +240,23 @@ float: # This run group produces 3 algorithm instances -- Annoy("angular", # 100), Annoy("angular", 200), and Annoy("angular", 400) -- each of # which will be used to run 12 different queries. 
- milvus: + milvus-hnsw: docker-tag: ann-benchmarks-milvus module: ann_benchmarks.algorithms.milvus constructor: Milvus - base-args: ["@metric"] + base-args: ["@metric", "@connection"] + run-groups: + milvus: + args: [['HNSW'], [{"M": 4, "efConstruction": 500}, {"M": 32, "efConstruction": 500}]] + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] # ef + milvus-ivf: + docker-tag: ann-benchmarks-milvus + module: ann_benchmarks.algorithms.milvus + constructor: Milvus + base-args: ["@metric", "@connection"] run-groups: milvus: - args: [['IVF_FLAT', 'IVF_SQ8'], [100, 300, 1000, 3000, 10000, 30000]] # nlist + args: [['IVF_FLAT', 'IVF_SQ8'], [{"nlist": 100}, {"nlist": 300}, {"nlist": 1000}, {"nlist": 3000}, {"nlist": 10000}, {"nlist": 30000}]] # nlist query-args: [[1, 3, 10, 30, 100, 300]] # nprobe (should be <= nlist) nearpy: disabled: true diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 50b6940a6..d1194e738 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -1,45 +1,71 @@ from __future__ import absolute_import -import milvus +from sqlite3 import paramstyle +from pymilvus import ( + connections, + utility, + FieldSchema, + CollectionSchema, + DataType, + IndexType, + Collection, +) import numpy import sklearn.preprocessing from ann_benchmarks.algorithms.base import BaseANN class Milvus(BaseANN): - def __init__(self, metric, index_type, nlist): - self._nlist = nlist + def __init__(self, metric, conn_params, index_type, method_params): + self._host = conn_params['host'] + self._port = conn_params['port'] # 19530 + # connections.connect(host=conn_params['host'], port=conn_params['port']) + # fields = [ + # FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), + # FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=100) + # ] + # schema = CollectionSchema(fields) + # self._milvus = Collection('milvus', schema) + self._index_type = index_type + self._method_params = method_params self._nprobe = None self._metric = metric - self._milvus = milvus.Milvus() - self._milvus.connect(host='localhost', port='19530') - self._table_name = 'test01' - self._index_type = index_type def fit(self, X): if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') + X = sklearn.preprocessing.normalize(X, axis=1) + + # TODO: if we can set the dim later, mabe return this to the init func + connections.connect(host=self._host, port=self._port) + fields = [ + FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), + FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=len(X[0])) + ] + schema = CollectionSchema(fields) + self._milvus = Collection('milvus', schema) - self._milvus.create_table({'table_name': self._table_name, 'dimension': X.shape[1]}) - vector_ids = [id for id in range(len(X))] - self._milvus.insert(table_name=self._table_name, records=X.tolist(), ids=vector_ids) - index_type = getattr(milvus.IndexType, self._index_type) # a bit hacky but works - self._milvus.create_index(self._table_name, {'index_type': index_type, 'nlist': self._nlist}) + self._milvus.insert([[id for id in range(len(X))], X.tolist()]) + self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) + self._milvus.load() - def set_query_arguments(self, nprobe): - if nprobe > self._nlist: - print('warning! 
nprobe > nlist') - nprobe = self._nlist - self._nprobe = nprobe + def set_query_arguments(self, param): + self._query_params = dict() + if 'IVF_' in self._index_type: + if param > self._method_params['nlist']: + print('warning! nprobe > nlist') + param = self._method_params['nlist'] + self._query_params['nprobe'] = param + if 'HNSW' in self._index_type: + self._query_params['ef'] = param def query(self, v, n): if self._metric == 'angular': v /= numpy.linalg.norm(v) v = v.tolist() - status, results = self._milvus.search(table_name=self._table_name, query_records=[v], top_k=n, nprobe=self._nprobe) + results = self._milvus.search([v], 'vector', {'metric_type':'L2', 'params':self._query_params}, limit=n) if not results: return [] # Seems to happen occasionally, not sure why result_ids = [result.id for result in results[0]] return result_ids def __str__(self): - return 'Milvus(index_type=%s, nlist=%d, nprobe=%d)' % (self._index_type, self._nlist, self._nprobe) + return 'Milvus(index_type=%s, method_params=%s, query_params=%s)' % (self._index_type, str(self._method_params), str(self._nprobe)) From d6c06af46e3126deeb556b633f66646c863e30de Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Mon, 14 Feb 2022 11:47:10 +0200 Subject: [PATCH 03/77] added --run-group to run a specific parameters combination --- ann_benchmarks/algorithms/definitions.py | 5 +++-- ann_benchmarks/main.py | 13 +++++++++++-- multirun.py | 18 ++++++++++++------ 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py index 3b9c4300b..b4aea1bf2 100644 --- a/ann_benchmarks/algorithms/definitions.py +++ b/ann_benchmarks/algorithms/definitions.py @@ -12,7 +12,7 @@ Definition = collections.namedtuple( 'Definition', - ['algorithm', 'constructor', 'module', 'docker_tag', + ['algorithm', 'run_group', 'constructor', 'module', 'docker_tag', 'arguments', 'query_argument_groups', 'disabled']) @@ -116,7 +116,7 @@ def get_definitions(definition_file, dimension, point_type="float", if "base-args" in algo: base_args = algo["base-args"] - for run_group in algo["run-groups"].values(): + for run_group_name, run_group in algo["run-groups"].items(): if "arg-groups" in run_group: groups = [] for arg_group in run_group["arg-groups"]: @@ -163,6 +163,7 @@ def get_definitions(definition_file, dimension, point_type="float", aargs = [_substitute_variables(arg, vs) for arg in aargs] definitions.append(Definition( algorithm=name, + run_group = run_group_name, docker_tag=algo['docker-tag'], module=algo['module'], constructor=algo['constructor'], diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py index ed0fd3957..aac871b55 100644 --- a/ann_benchmarks/main.py +++ b/ann_benchmarks/main.py @@ -74,6 +74,11 @@ def main(): metavar='NAME', help='run only the named algorithm', default=None) + parser.add_argument( + '--run-group', + metavar='NAME', + help='run only the named run group', + default=None) parser.add_argument( '--docker-tag', metavar='NAME', @@ -165,7 +170,7 @@ def main(): '--client-id', metavar='NUM', type=positive_int, - help='specific client id (among the total client)', + help='specific client id (among the total clients)', default=1) args = parser.parse_args() @@ -226,9 +231,13 @@ def main(): random.shuffle(definitions) if args.algorithm: - logger.info(f'running only {args.algorithm}') + logger.info(f'running only {args.algorithm} algorithms') definitions = [d for d in definitions if d.algorithm == args.algorithm] + if args.run_group: + 
logger.info(f'running only {args.run_group} run groups') + definitions = [d for d in definitions if d.run_group == args.run_group] + if not args.local: # See which Docker images we have available docker_client = docker.from_env() diff --git a/multirun.py b/multirun.py index 07c62c3db..c48561d1b 100644 --- a/multirun.py +++ b/multirun.py @@ -61,6 +61,11 @@ metavar='ALGO', help='run redisearch with this algorithm', default="redisearch-hnsw") + parser.add_argument( + '--run-group', + metavar='NAME', + help='run only the named run group', + default=None) parser.add_argument( '--cluster', action='store_true', @@ -80,12 +85,13 @@ base = 'python run.py --local --algorithm ' + args.algorithm + ' -k ' + str(args.count) + \ ' --dataset ' + args.dataset - if args.host: base += ' --host ' + str(args.host) - if args.port: base += ' --port ' + str(args.port) - if args.user: base += ' --user ' + str(args.user) - if args.auth: base += ' --auth ' + str(args.auth) - if args.force: base += ' --force' - if args.cluster:base += ' --cluster' + if args.host: base += ' --host ' + str(args.host) + if args.port: base += ' --port ' + str(args.port) + if args.user: base += ' --user ' + str(args.user) + if args.auth: base += ' --auth ' + str(args.auth) + if args.force: base += ' --force' + if args.cluster: base += ' --cluster' + if args.run_group: base += ' --run-group ' + str(args.run_group) base_build = base + ' --build-only --total-clients ' + str(args.build_clients) base_test = base + ' --test-only --runs 1 --total-clients ' + str(args.test_clients) From a80dd70b267d4d18a0e379b9127a4385d8783c9f Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Mon, 14 Feb 2022 11:47:39 +0200 Subject: [PATCH 04/77] more updates for milvus algorithm --- algos.yaml | 127 +++++++++++++++------------- ann_benchmarks/algorithms/milvus.py | 25 ++---- 2 files changed, 74 insertions(+), 78 deletions(-) diff --git a/algos.yaml b/algos.yaml index 26bc1d3c9..1e48525a2 100644 --- a/algos.yaml +++ b/algos.yaml @@ -10,38 +10,38 @@ float: arg-groups: - {"M": 4, "efConstruction": 500} query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-8: - # arg-groups: - # - {"M": 8, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-12: - # arg-groups: - # - {"M": 12, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-16: - # arg-groups: - # - {"M": 16, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-24: - # arg-groups: - # - {"M": 24, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-36: - # arg-groups: - # - {"M": 36, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-48: - # arg-groups: - # - {"M": 48, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-64: - # arg-groups: - # - {"M": 64, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] - # M-96: - # arg-groups: - # - {"M": 96, "efConstruction": 500} - # query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 
120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] redisearch-flat: docker-tag: ann-benchmarks-redisearch module: ann_benchmarks.algorithms.redisearch @@ -51,30 +51,6 @@ float: BS-2^20: arg-groups: - {"BLOCK_SIZE": 1048576} - # M-8: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-12: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-16: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-24: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-36: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-48: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-64: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} - # M-96: - # arg-groups: - # - {"BLOCK_SIZE": 1048576} sptag: docker-tag: ann-benchmarks-sptag module: ann_benchmarks.algorithms.sptag @@ -244,16 +220,45 @@ float: docker-tag: ann-benchmarks-milvus module: ann_benchmarks.algorithms.milvus constructor: Milvus - base-args: ["@metric", "@connection"] + base-args: ["@metric", "@dimension", "@connection", "HNSW"] run-groups: - milvus: - args: [['HNSW'], [{"M": 4, "efConstruction": 500}, {"M": 32, "efConstruction": 500}]] - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] # ef + M-4: + arg-groups: + - {"M": 4, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"M": 8, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] milvus-ivf: docker-tag: ann-benchmarks-milvus module: ann_benchmarks.algorithms.milvus constructor: Milvus - base-args: ["@metric", "@connection"] + base-args: ["@metric", "@dimension", "@connection"] run-groups: milvus: args: [['IVF_FLAT', 'IVF_SQ8'], [{"nlist": 100}, {"nlist": 300}, {"nlist": 1000}, {"nlist": 3000}, {"nlist": 10000}, {"nlist": 30000}]] # nlist diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index d1194e738..82364f6a9 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -15,16 +15,16 @@ class Milvus(BaseANN): - def __init__(self, metric, conn_params, index_type, method_params): + def __init__(self, metric, dim, conn_params, index_type, method_params): self._host = conn_params['host'] self._port = conn_params['port'] # 19530 - # connections.connect(host=conn_params['host'], port=conn_params['port']) - # fields = [ - # FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), - # 
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=100) - # ] - # schema = CollectionSchema(fields) - # self._milvus = Collection('milvus', schema) + connections.connect(host=conn_params['host'], port=conn_params['port']) + fields = [ + FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), + FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim) + ] + schema = CollectionSchema(fields) + self._milvus = Collection('milvus', schema) self._index_type = index_type self._method_params = method_params self._nprobe = None @@ -33,15 +33,6 @@ def __init__(self, metric, conn_params, index_type, method_params): def fit(self, X): if self._metric == 'angular': X = sklearn.preprocessing.normalize(X, axis=1) - - # TODO: if we can set the dim later, mabe return this to the init func - connections.connect(host=self._host, port=self._port) - fields = [ - FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), - FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=len(X[0])) - ] - schema = CollectionSchema(fields) - self._milvus = Collection('milvus', schema) self._milvus.insert([[id for id in range(len(X))], X.tolist()]) self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) From eebac85b5a77aafae5946e2c9f39d46f60961f9d Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Mon, 14 Feb 2022 13:23:36 +0200 Subject: [PATCH 05/77] added support for multi-client build --- ann_benchmarks/algorithms/milvus.py | 23 ++++++++++++++--------- multirun.py | 4 ---- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 82364f6a9..2e866568e 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -19,22 +19,27 @@ def __init__(self, metric, dim, conn_params, index_type, method_params): self._host = conn_params['host'] self._port = conn_params['port'] # 19530 connections.connect(host=conn_params['host'], port=conn_params['port']) - fields = [ - FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), - FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim) - ] - schema = CollectionSchema(fields) - self._milvus = Collection('milvus', schema) + try: + fields = [ + FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False), + FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim) + ] + schema = CollectionSchema(fields) + self._milvus = Collection('milvus', schema) + except: + self._milvus = Collection('milvus') self._index_type = index_type self._method_params = method_params self._nprobe = None self._metric = metric - def fit(self, X): + def fit(self, X, offset=0, limit=None): + limit = limit if limit else len(X) + X = X[offset:limit] if self._metric == 'angular': - X = sklearn.preprocessing.normalize(X, axis=1) + X = sklearn.preprocessing.normalize(X) - self._milvus.insert([[id for id in range(len(X))], X.tolist()]) + self._milvus.insert([[id for id in range(offset, limit)], X.tolist()]) self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) self._milvus.load() diff --git a/multirun.py b/multirun.py index c48561d1b..bad3bd905 100644 --- a/multirun.py +++ b/multirun.py @@ -77,10 +77,6 @@ if isredis: redis = RedisCluster if args.cluster else Redis redis = redis(host=args.host, port=args.port, password=args.auth, username=args.user) - elif 'milvus' in 
args.algorithm: - if args.build_clients > 1: - print('milvus does not allow multi client build. running with one builder') - args.build_clients = 1 base = 'python run.py --local --algorithm ' + args.algorithm + ' -k ' + str(args.count) + \ ' --dataset ' + args.dataset From f52e62397f827f70dce41a75fd7af41c4447ce36 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Wed, 16 Feb 2022 14:30:24 +0200 Subject: [PATCH 06/77] milvus.py improvement --- ann_benchmarks/algorithms/milvus.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 2e866568e..0d32e12c7 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -18,6 +18,10 @@ class Milvus(BaseANN): def __init__(self, metric, dim, conn_params, index_type, method_params): self._host = conn_params['host'] self._port = conn_params['port'] # 19530 + self._index_type = index_type + self._method_params = method_params + self._metric = metric + self._query_params = dict() connections.connect(host=conn_params['host'], port=conn_params['port']) try: fields = [ @@ -26,13 +30,10 @@ def __init__(self, metric, dim, conn_params, index_type, method_params): ] schema = CollectionSchema(fields) self._milvus = Collection('milvus', schema) + self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) except: self._milvus = Collection('milvus') - self._index_type = index_type - self._method_params = method_params - self._nprobe = None - self._metric = metric - + def fit(self, X, offset=0, limit=None): limit = limit if limit else len(X) X = X[offset:limit] @@ -40,11 +41,13 @@ def fit(self, X, offset=0, limit=None): X = sklearn.preprocessing.normalize(X) self._milvus.insert([[id for id in range(offset, limit)], X.tolist()]) - self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) - self._milvus.load() def set_query_arguments(self, param): - self._query_params = dict() + if self._milvus.has_index(): + if utility.wait_for_index_building_complete('milvus', 'vector'): + self._milvus.load() + else: raise Exception('index has error') + else: raise Exception('index is missing') if 'IVF_' in self._index_type: if param > self._method_params['nlist']: print('warning! 
nprobe > nlist') @@ -64,4 +67,4 @@ def query(self, v, n): return result_ids def __str__(self): - return 'Milvus(index_type=%s, method_params=%s, query_params=%s)' % (self._index_type, str(self._method_params), str(self._nprobe)) + return 'Milvus(index_type=%s, method_params=%s, query_params=%s)' % (self._index_type, str(self._method_params), str(self._query_params)) From 4b07c481db0bc6bd6ce9407acc426a0b0af8a499 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 24 Feb 2022 16:19:20 +0200 Subject: [PATCH 07/77] default values update --- ann_benchmarks/algorithms/redisearch.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 318f2f44d..5faf86c89 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -14,16 +14,17 @@ def __init__(self, algo, metric, conn_params, method_param): self.index_name = "ann_benchmark" redis = RedisCluster if conn_params['cluster'] else Redis - self.redis = redis(host=conn_params["host"], port=conn_params["port"], - password=conn_params["auth"], username=conn_params["user"], - decode_responses=False) + host = conn_params["host"] if conn_params["host"] else 'localhost' + port = conn_params["port"] if conn_params["port"] else 6379 + self.redis = redis(host=host, port=port, decode_responses=False, + password=conn_params["auth"], username=conn_params["user"]) def fit(self, X, offset=0, limit=None): limit = limit if limit else len(X) try: # https://oss.redis.com/redisearch/master/Commands/#ftcreate if self.algo == "HNSW": - self.redis.execute_command('FT.CREATE', self.index_name, 'SCHEMA', 'vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'M', self.method_param['M'] , 'EF_CONSTRUCTION', self.method_param["efConstruction"], target_nodes='random') + self.redis.execute_command('FT.CREATE', self.index_name, 'SCHEMA', 'vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"], target_nodes='random') elif self.algo == "FLAT": self.redis.execute_command('FT.CREATE', self.index_name, 'SCHEMA', 'vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE'], target_nodes='random') except Exception as e: From 25ff7ae241089454a77ee86eb0f0d4da95ddd22c Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Tue, 1 Mar 2022 15:02:31 +0200 Subject: [PATCH 08/77] rename TOP_K to KNN --- ann_benchmarks/algorithms/redisearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 5faf86c89..9634baee8 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -40,7 +40,7 @@ def set_query_arguments(self, ef): def query(self, v, k): # https://oss.redis.com/redisearch/master/Commands/#ftsearch qparams = f' EF_RUNTIME {self.ef}' if self.algo == 'HNSW' else '' - vq = f'*=>[TOP_K {k} @vector $BLOB {qparams}]' + vq = f'*=>[KNN {k} @vector $BLOB {qparams}]' q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes()] return [int(doc.replace(b'ann_',b'')) for doc in self.redis.execute_command(*q, 
target_nodes='random')[1:]] From 00f51b5bdccb91edc6d2be5d1c0f75c2dbf2c035 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Thu, 3 Mar 2022 15:27:31 +0200 Subject: [PATCH 09/77] moved from L2 to either L2 or IP --- ann_benchmarks/algorithms/milvus.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 0d32e12c7..0438f573c 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -20,7 +20,7 @@ def __init__(self, metric, dim, conn_params, index_type, method_params): self._port = conn_params['port'] # 19530 self._index_type = index_type self._method_params = method_params - self._metric = metric + self.metric = {'angular': 'IP', 'euclidean': 'L2'}[metric] self._query_params = dict() connections.connect(host=conn_params['host'], port=conn_params['port']) try: @@ -30,14 +30,14 @@ def __init__(self, metric, dim, conn_params, index_type, method_params): ] schema = CollectionSchema(fields) self._milvus = Collection('milvus', schema) - self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':'L2', 'params':self._method_params}) + self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':self._metric, 'params':self._method_params}) except: self._milvus = Collection('milvus') def fit(self, X, offset=0, limit=None): limit = limit if limit else len(X) X = X[offset:limit] - if self._metric == 'angular': + if self._metric == 'IP': X = sklearn.preprocessing.normalize(X) self._milvus.insert([[id for id in range(offset, limit)], X.tolist()]) @@ -57,10 +57,10 @@ def set_query_arguments(self, param): self._query_params['ef'] = param def query(self, v, n): - if self._metric == 'angular': + if self._metric == 'IP': v /= numpy.linalg.norm(v) v = v.tolist() - results = self._milvus.search([v], 'vector', {'metric_type':'L2', 'params':self._query_params}, limit=n) + results = self._milvus.search([v], 'vector', {'metric_type':self._metric, 'params':self._query_params}, limit=n) if not results: return [] # Seems to happen occasionally, not sure why result_ids = [result.id for result in results[0]] From 05caf7e19d55a748496e873043df9c8ba844a66f Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Thu, 3 Mar 2022 15:49:37 +0200 Subject: [PATCH 10/77] added drop collection --- ann_benchmarks/algorithms/milvus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 0438f573c..b33511e2e 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -68,3 +68,6 @@ def query(self, v, n): def __str__(self): return 'Milvus(index_type=%s, method_params=%s, query_params=%s)' % (self._index_type, str(self._method_params), str(self._query_params)) + + def freeIndex(self): + utility.drop_collection("mlivus") From 253e5a8f773136de309fac8405b75f43f188cf19 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Thu, 3 Mar 2022 15:50:09 +0200 Subject: [PATCH 11/77] Update ann_benchmarks/algorithms/milvus.py Co-authored-by: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> --- ann_benchmarks/algorithms/milvus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index b33511e2e..78e2c1ba9 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -20,7 +20,7 @@ def __init__(self, metric, dim, conn_params, index_type, 
method_params): self._port = conn_params['port'] # 19530 self._index_type = index_type self._method_params = method_params - self.metric = {'angular': 'IP', 'euclidean': 'L2'}[metric] + self._metric = {'angular': 'IP', 'euclidean': 'L2'}[metric] self._query_params = dict() connections.connect(host=conn_params['host'], port=conn_params['port']) try: From 6ae9bff1cb04f2b3d340f6671c3ae4c4c0076fb3 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Mon, 7 Mar 2022 00:40:31 +0000 Subject: [PATCH 12/77] Changes towards redisbench_admin integration. Workdir fix --- multirun.py | 64 +++++++++++++++++++++++++++++++++++++++-------------- run.py | 6 +++++ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/multirun.py b/multirun.py index bad3bd905..de635f81c 100644 --- a/multirun.py +++ b/multirun.py @@ -2,10 +2,12 @@ from multiprocessing import Process import argparse import time +import json from redis import Redis from redis.cluster import RedisCluster import h5py -from ann_benchmarks.main import positive_int +import os +import pathlib from ann_benchmarks.results import get_result_filename if __name__ == "__main__": @@ -16,10 +18,15 @@ metavar='NAME', help='the dataset to load training points from', default='glove-100-angular') + parser.add_argument( + '--json-output', + help='Path to the output file. If defined will store the results in json format.', + default="" + ) parser.add_argument( "-k", "--count", - default=10, - type=positive_int, + default="10", + type=str, help="the number of near neighbours to search for") parser.add_argument( '--host', @@ -27,7 +34,7 @@ default=None) parser.add_argument( '--port', - type=positive_int, + type=str, help='the port "host" is listening on', default=None) parser.add_argument( @@ -42,16 +49,16 @@ default=None) parser.add_argument( '--build-clients', - type=int, + type=str, metavar='NUM', help='total number of clients running in parallel to build the index (could be 0)', - default=1) + default="1") parser.add_argument( '--test-clients', - type=int, + type=str, metavar='NUM', help='total number of clients running in parallel to test the index (could be 0)', - default=1) + default="1") parser.add_argument( '--force', help='re-run algorithms even if their results already exist', @@ -66,6 +73,12 @@ metavar='NAME', help='run only the named run group', default=None) + parser.add_argument( + '--runs', + type=str, + help='run each algorithm instance %(metavar)s times and use only' + ' the best result', + default="3") parser.add_argument( '--cluster', action='store_true', @@ -76,10 +89,11 @@ if isredis: redis = RedisCluster if args.cluster else Redis - redis = redis(host=args.host, port=args.port, password=args.auth, username=args.user) + redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user) - base = 'python run.py --local --algorithm ' + args.algorithm + ' -k ' + str(args.count) + \ - ' --dataset ' + args.dataset + + base = 'python3 run.py --local --algorithm ' + args.algorithm + ' -k ' + args.count + \ + ' --dataset ' + args.dataset + " --runs {} ".format(args.runs) if args.host: base += ' --host ' + str(args.host) if args.port: base += ' --port ' + str(args.port) @@ -91,9 +105,12 @@ base_build = base + ' --build-only --total-clients ' + str(args.build_clients) base_test = base + ' --test-only --runs 1 --total-clients ' + str(args.test_clients) - - if args.build_clients > 0: - clients = [Process(target=system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, args.build_clients + 1)] + workdir = 
pathlib.Path(__file__).parent.absolute() + print("Changing the workdir to {}".format(workdir)) + os.chdir(workdir) + results_dict = {} + if int(args.build_clients) > 0: + clients = [Process(target=system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] t0 = time.time() for client in clients: client.start() @@ -102,23 +119,36 @@ print(f'total build time: {total_time}\n\n') fn = get_result_filename(args.dataset, args.count) + print(fn) fn = path.join(fn, args.algorithm) if not path.isdir(fn): makedirs(fn) fn = path.join(fn, 'build_stats.hdf5') f = h5py.File(fn, 'w') f.attrs["build_time"] = total_time + print(fn) + index_size = -1 if isredis: if args.cluster: - f.attrs["index_size"] = -1 # TODO: get total size from all the shards + index_size = -1 # TODO: get total size from all the shards else: - f.attrs["index_size"] = redis.ft('ann_benchmark').info()['vector_index_sz_mb']*0x100000 + index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] + f.attrs["index_size"] = index_size f.close() + results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - if args.test_clients > 0: + + + if int(args.test_clients) > 0: queriers = [Process(target=system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, args.test_clients + 1)] t0 = time.time() for querier in queriers: querier.start() for querier in queriers: querier.join() query_time = time.time() - t0 print(f'total test time: {query_time}') + results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } + + if args.json_output != "": + with open(args.json_output,"w")as json_out_file: + print(f'storing json result into: {args.json_output}') + json.dump(results_dict,json_out_file) diff --git a/run.py b/run.py index 8ca27bc2e..c0a094238 100644 --- a/run.py +++ b/run.py @@ -1,6 +1,12 @@ +import os +import pathlib + from ann_benchmarks.main import main from multiprocessing import freeze_support if __name__ == "__main__": + workdir = pathlib.Path(__file__).parent.absolute() + print("Changing the workdir to {}".format(workdir)) + os.chdir(workdir) freeze_support() main() From 579b25aabc8bc11f5d216ade6022de4f3c053129 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 7 Mar 2022 09:59:44 +0200 Subject: [PATCH 13/77] added yandex 1B subset dataset generator --- ann_benchmarks/create_text_to_image_ds.py | 117 ++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 ann_benchmarks/create_text_to_image_ds.py diff --git a/ann_benchmarks/create_text_to_image_ds.py b/ann_benchmarks/create_text_to_image_ds.py new file mode 100644 index 000000000..9efd10a6a --- /dev/null +++ b/ann_benchmarks/create_text_to_image_ds.py @@ -0,0 +1,117 @@ +from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS +import struct +import numpy as np +import click +import h5py +from joblib import Parallel, delayed +import multiprocessing + +def read_fbin(filename, start_idx=0, chunk_size=None): + """ Read *.fbin file that contains float32 vectors + Args: + :param filename (str): path to *.fbin file + :param start_idx (int): start reading vectors from this index + :param chunk_size (int): number of vectors to read. 
+ If None, read all vectors + Returns: + Array of float32 vectors (numpy.ndarray) + """ + with open(filename, "rb") as f: + nvecs, dim = np.fromfile(f, count=2, dtype=np.int32) + nvecs = (nvecs - start_idx) if chunk_size is None else chunk_size + arr = np.fromfile(f, count=nvecs * dim, dtype=np.float32, + offset=start_idx * 4 * dim) + return arr.reshape(nvecs, dim) + + +def read_ibin(filename, start_idx=0, chunk_size=None): + """ Read *.ibin file that contains int32 vectors + Args: + :param filename (str): path to *.ibin file + :param start_idx (int): start reading vectors from this index + :param chunk_size (int): number of vectors to read. + If None, read all vectors + Returns: + Array of int32 vectors (numpy.ndarray) + """ + with open(filename, "rb") as f: + nvecs, dim = np.fromfile(f, count=2, dtype=np.int32) + nvecs = (nvecs - start_idx) if chunk_size is None else chunk_size + arr = np.fromfile(f, count=nvecs * dim, dtype=np.int32, + offset=start_idx * 4 * dim) + return arr.reshape(nvecs, dim) + + +def write_fbin(filename, vecs): + """ Write an array of float32 vectors to *.fbin file + Args:s + :param filename (str): path to *.fbin file + :param vecs (numpy.ndarray): array of float32 vectors to write + """ + assert len(vecs.shape) == 2, "Input array must have 2 dimensions" + with open(filename, "wb") as f: + nvecs, dim = vecs.shape + f.write(struct.pack(' Date: Mon, 7 Mar 2022 10:30:44 +0200 Subject: [PATCH 14/77] moved to main folder --- .../create_text_to_image_ds.py => create_text_to_image_ds.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ann_benchmarks/create_text_to_image_ds.py => create_text_to_image_ds.py (100%) diff --git a/ann_benchmarks/create_text_to_image_ds.py b/create_text_to_image_ds.py similarity index 100% rename from ann_benchmarks/create_text_to_image_ds.py rename to create_text_to_image_ds.py From d0d59173e7b79ddad1445d15250cb59347211ef3 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 7 Mar 2022 15:45:01 +0200 Subject: [PATCH 15/77] empty line --- create_text_to_image_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_text_to_image_ds.py b/create_text_to_image_ds.py index 9efd10a6a..c50339313 100644 --- a/create_text_to_image_ds.py +++ b/create_text_to_image_ds.py @@ -114,4 +114,4 @@ def create_ds(size, distance, test_set, train_set): write_output(train=train_set, test=test_set, fn=f'Text-to-Image-{size}M.hd5f', distance=distance, point_type='float', count=100) if __name__ == "__main__": - create_ds() \ No newline at end of file + create_ds() From 947c6a9368252b17ef26691ef5a45b472c33faea Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Tue, 8 Mar 2022 00:58:20 +0200 Subject: [PATCH 16/77] hybrid datasets generator --- create_hybrid_dataset.py | 75 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 create_hybrid_dataset.py diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py new file mode 100644 index 000000000..eae44d855 --- /dev/null +++ b/create_hybrid_dataset.py @@ -0,0 +1,75 @@ +from operator import ne +import click +from ann_benchmarks.datasets import get_dataset, DATASETS +from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS +import struct +import numpy as np +import click +import h5py +from joblib import Parallel, delayed +import multiprocessing + +def calc_i(i, x, bf, test, neighbors, distances, count, orig_ids): + if i % 1000 == 0: + print('%d/%d...' 
% (i, len(test))) + res = list(bf.query_with_distances(x, count)) + res.sort(key=lambda t: t[-1]) + neighbors[i] = [orig_ids[j] for j, _ in res] + distances[i] = [d for _, d in res] + + +@click.command() +@click.option('--data_set', type=click.Choice(DATASETS.keys(), case_sensitive=False), default='glove-100-angular') +def create_ds(data_set): + ds, dimension= get_dataset(data_set) + bucket_0_5 = [] + bucket_1 = [] + bucket_2 = [] + bucket_5 = [] + bucket_10 = [] + bucket_20 = [] + buckets = [bucket_0_5, bucket_1, bucket_2, bucket_5, bucket_10, bucket_20] + bucket_names=['0.5%', '1%', '2%', '5%', '10%', '20%'] + train = ds['train'] + test = ds['test'] + distance = ds.attrs['distance'] + count=len(ds['neighbors'][0]) + print(count) + print(train.shape) + for i in range(train.shape[0]): + if i % 200 == 6: # 0.5% + bucket_0_5.append(i) + elif i % 100 == 4: # 1% + bucket_1.append(i) + elif i % 50 == 3: # 2% + bucket_2.append(i) + elif i % 20 == 2: # 5% + bucket_5.append(i) + elif i % 10 == 1: # 10% + bucket_10.append(i) + elif i % 5 == 0: # 20% + bucket_20.append(i) + print(len(bucket_0_5), len(bucket_1), len(bucket_2), len(bucket_5), len(bucket_10), len(bucket_20)) + for i, bucket in enumerate(buckets): + fn=f'{data_set}-hybrid_{bucket_names[i]}.hd5f' + with h5py.File(fn, 'w') as f: + f.attrs['type'] = 'dense' + f.attrs['distance'] = ds.attrs['distance'] + f.attrs['dimension'] = len(test[0]) + f.attrs['point_type'] = 'float' + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + neighbors = f.create_dataset(f'neighbors', (len(test), count), dtype='i') + distances = f.create_dataset(f'distances', (len(test), count), dtype='f') + train_bucket = np.array(bucket, dtype = np.int32) + train_set = train[bucket] + print(train_set.shape) + bf = BruteForceBLAS(distance, precision=train.dtype) + bf.fit(train_set) + Parallel(n_jobs=multiprocessing.cpu_count(), require='sharedmem')(delayed(calc_i)(i, x, bf, test, neighbors, distances, count, train_bucket) for i, x in enumerate(test)) + print(neighbors[0]) + print(distances[0]) + + +if __name__ == "__main__": + create_ds() \ No newline at end of file From d7ed6899d89c1ffbd06b6a9a7b05b7785b33d140 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Tue, 8 Mar 2022 01:07:58 +0200 Subject: [PATCH 17/77] write the id buckets to the hd5f file --- create_hybrid_dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py index eae44d855..29b4ced19 100644 --- a/create_hybrid_dataset.py +++ b/create_hybrid_dataset.py @@ -57,10 +57,18 @@ def create_ds(data_set): f.attrs['distance'] = ds.attrs['distance'] f.attrs['dimension'] = len(test[0]) f.attrs['point_type'] = 'float' + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + # Write the id buckets so on ingestion we will know what data to assign for each id. + for j, id_bucket in enumerate(buckets): + np_bucket = np.array(id_bucket, dtype=np.int32) + f.create_dataset(f'{bucket_names[j]}_ids', np_bucket.shape, dtype=np_bucket.dtype)[:] = np_bucket + neighbors = f.create_dataset(f'neighbors', (len(test), count), dtype='i') distances = f.create_dataset(f'distances', (len(test), count), dtype='f') + + # Generate ground truth only for the relevan bucket. 
train_bucket = np.array(bucket, dtype = np.int32) train_set = train[bucket] print(train_set.shape) From 0c0d0735fb2ac119cbfc85bf483809496a9789cd Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Tue, 8 Mar 2022 01:09:07 +0200 Subject: [PATCH 18/77] empty line --- create_hybrid_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py index 29b4ced19..a2fa9e472 100644 --- a/create_hybrid_dataset.py +++ b/create_hybrid_dataset.py @@ -67,7 +67,7 @@ def create_ds(data_set): neighbors = f.create_dataset(f'neighbors', (len(test), count), dtype='i') distances = f.create_dataset(f'distances', (len(test), count), dtype='f') - + # Generate ground truth only for the relevan bucket. train_bucket = np.array(bucket, dtype = np.int32) train_set = train[bucket] @@ -80,4 +80,4 @@ def create_ds(data_set): if __name__ == "__main__": - create_ds() \ No newline at end of file + create_ds() From 1a8e1164ba42f9a1f2ba81f63cc5c6d78126357e Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Mon, 7 Mar 2022 23:21:22 +0000 Subject: [PATCH 19/77] Ensure workdir is used when creating build_stats results dir --- multirun.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/multirun.py b/multirun.py index de635f81c..37539c761 100644 --- a/multirun.py +++ b/multirun.py @@ -118,8 +118,7 @@ total_time = time.time() - t0 print(f'total build time: {total_time}\n\n') - fn = get_result_filename(args.dataset, args.count) - print(fn) + fn = "{}/{}".format(workdir, get_result_filename(args.dataset, args.count)) fn = path.join(fn, args.algorithm) if not path.isdir(fn): makedirs(fn) From f1d80d050940ef35eb3f83f305d955a2765ac847 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Tue, 8 Mar 2022 15:52:22 +0200 Subject: [PATCH 20/77] fix for passing number of runs --- multirun.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/multirun.py b/multirun.py index 37539c761..a334e1651 100644 --- a/multirun.py +++ b/multirun.py @@ -30,6 +30,7 @@ help="the number of near neighbours to search for") parser.add_argument( '--host', + type=str, help='host name or IP', default=None) parser.add_argument( @@ -39,11 +40,13 @@ default=None) parser.add_argument( '--auth', '-a', + type=str, metavar='PASS', help='password for connection', default=None) parser.add_argument( '--user', + type=str, metavar='NAME', help='user name for connection', default=None) @@ -70,6 +73,7 @@ default="redisearch-hnsw") parser.add_argument( '--run-group', + type=str, metavar='NAME', help='run only the named run group', default=None) @@ -87,24 +91,29 @@ args = parser.parse_args() isredis = True if 'redisearch' in args.algorithm else False + if args.host is None: + args.host = 'localhost' + if args.port is None: + if 'redisearch' in args.algorithm: args.port = 6379 + if 'milvus' in args.algorithm: args.port = 19530 + if isredis: redis = RedisCluster if args.cluster else Redis redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user) - base = 'python3 run.py --local --algorithm ' + args.algorithm + ' -k ' + args.count + \ - ' --dataset ' + args.dataset + " --runs {} ".format(args.runs) + base = 'python3 run.py --local --algorithm ' + args.algorithm + ' -k ' + args.count + ' --dataset ' + args.dataset - if args.host: base += ' --host ' + str(args.host) - if args.port: base += ' --port ' + str(args.port) - if args.user: base += ' --user ' + str(args.user) - if args.auth: base += ' --auth ' + str(args.auth) + if 
args.host: base += ' --host ' + args.host + if args.port: base += ' --port ' + args.port + if args.user: base += ' --user ' + args.user + if args.auth: base += ' --auth ' + args.auth if args.force: base += ' --force' if args.cluster: base += ' --cluster' - if args.run_group: base += ' --run-group ' + str(args.run_group) + if args.run_group: base += ' --run-group ' + args.run_group - base_build = base + ' --build-only --total-clients ' + str(args.build_clients) - base_test = base + ' --test-only --runs 1 --total-clients ' + str(args.test_clients) + base_build = base + ' --build-only --total-clients ' + args.build_clients + base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) workdir = pathlib.Path(__file__).parent.absolute() print("Changing the workdir to {}".format(workdir)) os.chdir(workdir) @@ -128,9 +137,7 @@ print(fn) index_size = -1 if isredis: - if args.cluster: - index_size = -1 # TODO: get total size from all the shards - else: + if not args.cluster: # TODO: get total size from all the shards index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] f.attrs["index_size"] = index_size f.close() From 5b9883787a40a4dfdd44fce83beac6304248a18c Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Wed, 9 Mar 2022 09:10:28 +0200 Subject: [PATCH 21/77] hybrid dataset generator. Redisearch hybrid load and run --- ann_benchmarks/algorithms/redisearch.py | 49 +++++++++++++++--- ann_benchmarks/datasets.py | 14 ++++- ann_benchmarks/runner.py | 27 ++++++++-- create_hybrid_dataset.py | 69 ++++++++++++++++--------- create_text_to_image_ds.py | 2 +- 5 files changed, 123 insertions(+), 38 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 9634baee8..baca62306 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from optparse import Values from redis import Redis from redis.cluster import RedisCluster from ann_benchmarks.constants import INDEX_DIR @@ -19,28 +20,62 @@ def __init__(self, algo, metric, conn_params, method_param): self.redis = redis(host=host, port=port, decode_responses=False, password=conn_params["auth"], username=conn_params["user"]) - def fit(self, X, offset=0, limit=None): + def fit(self, X, offset=0, limit=None, hybrid_buckets = None): limit = limit if limit else len(X) try: + args = [self.index_name, 'SCHEMA'] + if hybrid_buckets: + args.extend(['n', 'NUMERIC', 't', 'TEXT']) # https://oss.redis.com/redisearch/master/Commands/#ftcreate if self.algo == "HNSW": - self.redis.execute_command('FT.CREATE', self.index_name, 'SCHEMA', 'vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"], target_nodes='random') + args.extend(['vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"]]) elif self.algo == "FLAT": - self.redis.execute_command('FT.CREATE', self.index_name, 'SCHEMA', 'vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE'], target_nodes='random') + args.extend(['vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 
'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE']]) + self.redis.execute_command('FT.CREATE', *args, target_nodes='random') except Exception as e: if 'Index already exists' not in str(e): raise - - for i in range(offset, limit): - self.redis.execute_command('HSET', f'ann_{i}', 'vector', X[i].tobytes()) + p = self.redis.pipeline(transaction=False) + count = 0 + if hybrid_buckets: + print('running hybrid') + for bucket in hybrid_buckets.values(): + ids = bucket['ids'] + text = bucket['text'].decode() + number = bucket['number'] + print('calling HSET', f'ann_', 'vector', '', 't', text, 'n', number) + for id in ids: + if id >= offset and id < limit: + p.execute_command('HSET', f'ann_{id}', 'vector', X[id].tobytes(), 't', text, 'n', int(number)) + count+=1 + if count == 1000: + p.execute() + p.reset() + count = 0 + p.execute() + else: + for i in range(offset, limit): + p.execute_command('HSET', f'ann_{i}', 'vector', X[i].tobytes()) + count+=1 + if count == 1000: + p.execute() + p.reset() + count = 0 + p.execute() def set_query_arguments(self, ef): self.ef = ef + def set_hybrid_query(self, text): + self.text = text + def query(self, v, k): # https://oss.redis.com/redisearch/master/Commands/#ftsearch qparams = f' EF_RUNTIME {self.ef}' if self.algo == 'HNSW' else '' - vq = f'*=>[KNN {k} @vector $BLOB {qparams}]' + if self.text: + vq = f'(@t:{self.text})=>[KNN {k} @vector $BLOB {qparams}]' + else: + vq = f'*=>[KNN {k} @vector $BLOB {qparams}]' q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes()] return [int(doc.replace(b'ann_',b'')) for doc in self.redis.execute_command(*q, target_nodes='random')[1:]] diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index f89069ebe..3d2598da3 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -7,6 +7,7 @@ from urllib.request import urlretrieve from ann_benchmarks.distance import dataset_transform +import urllib.parse def download(src, dst): @@ -25,7 +26,10 @@ def get_dataset_fn(dataset): def get_dataset(which): hdf5_fn = get_dataset_fn(which) try: - url = 'http://ann-benchmarks.com/%s.hdf5' % which + if 'hybrid' in which: + url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which) + else: + url = 'http://ann-benchmarks.com/%s.hdf5' % which download(url, hdf5_fn) except: print("Cannot download %s" % url) @@ -40,7 +44,6 @@ def get_dataset(which): return hdf5_f, dimension - # Everything below this line is related to creating datasets # You probably never need to do this at home, # just rely on the prepared datasets at http://ann-benchmarks.com @@ -464,3 +467,10 @@ def lastfm(out_fn, n_dimensions, test_size=50000): out_fn, 'sift.hamming.256'), 'kosarak-jaccard': lambda out_fn: kosarak(out_fn), } + +hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'] +percentiles= ['0.5', '1', '2', '5', '10', '20', '50'] +for dataset in hybrid_datasets: + for percentile in percentiles: + DATASETS[f'{dataset}-hybrid_{percentile}'] = lambda fn: () + diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index 63418cb9f..35f73d8de 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -112,6 +112,17 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num X_train, X_test = dataset_transform(D) + hybrid_buckets = None + if 'bucket_names' in D.attrs: + 
hybrid_buckets = {} + bucket_names = D.attrs['bucket_names'] + for bucket_name in bucket_names: + bucket_dict = {} + bucket_dict['ids'] = numpy.array(D[f'{bucket_name}_ids']) + bucket_dict['text'] = D[bucket_name]['text'][()] + bucket_dict['number'] = D[bucket_name]['number'][()] + hybrid_buckets[bucket_name] = bucket_dict + try: prepared_queries = False if hasattr(algo, "supports_prepared_queries"): @@ -120,15 +131,17 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num if not test_only: per_client = len(X_train) // num_clients offset = per_client * (id - 1) - fit_args = [X_train] + fit_kwargs = {} if "offset" and "limit" in inspect.getfullargspec(algo.fit)[0]: - fit_args.append(offset) + fit_kwargs['offset']=offset if num_clients != id: - fit_args.append(offset + per_client) - + fit_kwargs['limit']=offset + per_client + if hybrid_buckets: + fit_kwargs['hybrid_buckets']=hybrid_buckets + t0 = time.time() memory_usage_before = algo.get_memory_usage() - algo.fit(*fit_args) + algo.fit(X_train, **fit_kwargs) build_time = time.time() - t0 index_size = algo.get_memory_usage() - memory_usage_before print('Built index in', build_time) @@ -147,6 +160,10 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num (pos, len(query_argument_groups))) if query_arguments: algo.set_query_arguments(*query_arguments) + if hybrid_buckets: + text = hybrid_buckets[D.attrs['selected_bucket']]['text'].decode() + print("setting hybrid text query", text) + algo.set_hybrid_query(text) descriptor, results = run_individual_query( algo, X_train, X_test, distance, count, run_count, batch) if not test_only: diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py index a2fa9e472..2f438299c 100644 --- a/create_hybrid_dataset.py +++ b/create_hybrid_dataset.py @@ -16,47 +16,70 @@ def calc_i(i, x, bf, test, neighbors, distances, count, orig_ids): res.sort(key=lambda t: t[-1]) neighbors[i] = [orig_ids[j] for j, _ in res] distances[i] = [d for _, d in res] - -@click.command() -@click.option('--data_set', type=click.Choice(DATASETS.keys(), case_sensitive=False), default='glove-100-angular') -def create_ds(data_set): - ds, dimension= get_dataset(data_set) +def create_buckets(train): bucket_0_5 = [] bucket_1 = [] bucket_2 = [] bucket_5 = [] bucket_10 = [] bucket_20 = [] - buckets = [bucket_0_5, bucket_1, bucket_2, bucket_5, bucket_10, bucket_20] - bucket_names=['0.5%', '1%', '2%', '5%', '10%', '20%'] - train = ds['train'] - test = ds['test'] - distance = ds.attrs['distance'] - count=len(ds['neighbors'][0]) - print(count) - print(train.shape) + bucket_50 = [] + other_bucket = [] + buckets = [bucket_0_5, bucket_1, bucket_2, bucket_5, bucket_10, bucket_20, bucket_50, other_bucket] + bucket_names=['0.5', '1', '2', '5', '10', '20', '50', 'other'] for i in range(train.shape[0]): - if i % 200 == 6: # 0.5% + if i % 200 == 19: # 0.5% bucket_0_5.append(i) - elif i % 100 == 4: # 1% + elif i % 100 == 17: # 1% bucket_1.append(i) - elif i % 50 == 3: # 2% + elif i % 50 == 9: # 2% bucket_2.append(i) - elif i % 20 == 2: # 5% + elif i % 20 == 7: # 5% bucket_5.append(i) - elif i % 10 == 1: # 10% + elif i % 10 == 3: # 10% bucket_10.append(i) - elif i % 5 == 0: # 20% + elif i % 2 == 0: # 50% + bucket_50.append(i) + elif i % 5 <= 1: # 20% bucket_20.append(i) - print(len(bucket_0_5), len(bucket_1), len(bucket_2), len(bucket_5), len(bucket_10), len(bucket_20)) + else: + other_bucket.append(i) + print(len(bucket_0_5), len(bucket_1), len(bucket_2), len(bucket_5), len(bucket_10), 
len(bucket_20), len(bucket_50), len(other_bucket)) + numeric_values = {} + text_values = {} + for i, bucket_name in enumerate(bucket_names): + numeric_values[bucket_name] = i + text_values[bucket_name] = f'text_{i}' + print(numeric_values) + print(text_values) + return buckets, bucket_names, numeric_values, text_values + +@click.command() +@click.option('--data_set', type=click.Choice(DATASETS.keys(), case_sensitive=False), default='glove-100-angular') +def create_ds(data_set): + ds, dimension= get_dataset(data_set) + train = ds['train'] + test = ds['test'] + distance = ds.attrs['distance'] + count=len(ds['neighbors'][0]) + print(count) + print(train.shape) + buckets, bucket_names, numeric_values, text_values = create_buckets(train) + for i, bucket in enumerate(buckets): - fn=f'{data_set}-hybrid_{bucket_names[i]}.hd5f' + fn=f'{data_set}-hybrid_{bucket_names[i]}.hdf5' with h5py.File(fn, 'w') as f: f.attrs['type'] = 'dense' f.attrs['distance'] = ds.attrs['distance'] f.attrs['dimension'] = len(test[0]) f.attrs['point_type'] = 'float' + f.attrs['bucket_names'] = bucket_names + f.attrs['selected_bucket'] = bucket_names[i] + for bucket_name in bucket_names: + grp = f.create_group(bucket_name) + grp["text"] = text_values[bucket_name] + grp["number"] = numeric_values[bucket_name] f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test @@ -75,8 +98,8 @@ def create_ds(data_set): bf = BruteForceBLAS(distance, precision=train.dtype) bf.fit(train_set) Parallel(n_jobs=multiprocessing.cpu_count(), require='sharedmem')(delayed(calc_i)(i, x, bf, test, neighbors, distances, count, train_bucket) for i, x in enumerate(test)) - print(neighbors[0]) - print(distances[0]) + print(neighbors[1]) + print(distances[1]) if __name__ == "__main__": diff --git a/create_text_to_image_ds.py b/create_text_to_image_ds.py index c50339313..3343e4d2f 100644 --- a/create_text_to_image_ds.py +++ b/create_text_to_image_ds.py @@ -111,7 +111,7 @@ def write_output(train, test, fn, distance, point_type='float', count=100): def create_ds(size, distance, test_set, train_set): test_set = read_fbin(test_set) train_set= read_fbin(train_set, chunk_size=size*1000000) - write_output(train=train_set, test=test_set, fn=f'Text-to-Image-{size}M.hd5f', distance=distance, point_type='float', count=100) + write_output(train=train_set, test=test_set, fn=f'Text-to-Image-{size}M.hdf5', distance=distance, point_type='float', count=100) if __name__ == "__main__": create_ds() From 766941bff2e281f6dbb2124a4174145d0e580231 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Wed, 9 Mar 2022 14:38:27 +0200 Subject: [PATCH 22/77] fixed dataset name. 
redisearch fixes --- ann_benchmarks/algorithms/redisearch.py | 14 ++++++-------- ann_benchmarks/datasets.py | 2 +- create_hybrid_dataset.py | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index baca62306..51aa6454b 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -43,24 +43,22 @@ def fit(self, X, offset=0, limit=None, hybrid_buckets = None): ids = bucket['ids'] text = bucket['text'].decode() number = bucket['number'] - print('calling HSET', f'ann_', 'vector', '', 't', text, 'n', number) + print('calling HSET', f'', 'vector', '', 't', text, 'n', number) for id in ids: if id >= offset and id < limit: - p.execute_command('HSET', f'ann_{id}', 'vector', X[id].tobytes(), 't', text, 'n', int(number)) + p.execute_command('HSET', int(id), 'vector', X[id].tobytes(), 't', text, 'n', int(number)) count+=1 - if count == 1000: + if count % 1000 == 0: p.execute() p.reset() - count = 0 p.execute() else: for i in range(offset, limit): - p.execute_command('HSET', f'ann_{i}', 'vector', X[i].tobytes()) + p.execute_command('HSET', i, 'vector', X[i].tobytes()) count+=1 - if count == 1000: + if count % 1000 == 0: p.execute() p.reset() - count = 0 p.execute() def set_query_arguments(self, ef): @@ -77,7 +75,7 @@ def query(self, v, k): else: vq = f'*=>[KNN {k} @vector $BLOB {qparams}]' q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes()] - return [int(doc.replace(b'ann_',b'')) for doc in self.redis.execute_command(*q, target_nodes='random')[1:]] + return [int(doc) for doc in self.redis.execute_command(*q, target_nodes='random')[1:]] def freeIndex(self): self.redis.execute_command("FLUSHALL") diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 3d2598da3..4cb79c644 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -472,5 +472,5 @@ def lastfm(out_fn, n_dimensions, test_size=50000): percentiles= ['0.5', '1', '2', '5', '10', '20', '50'] for dataset in hybrid_datasets: for percentile in percentiles: - DATASETS[f'{dataset}-hybrid_{percentile}'] = lambda fn: () + DATASETS[f'{dataset}-hybrid-{percentile}'] = lambda fn: () diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py index 2f438299c..c22797f0c 100644 --- a/create_hybrid_dataset.py +++ b/create_hybrid_dataset.py @@ -68,7 +68,7 @@ def create_ds(data_set): buckets, bucket_names, numeric_values, text_values = create_buckets(train) for i, bucket in enumerate(buckets): - fn=f'{data_set}-hybrid_{bucket_names[i]}.hdf5' + fn=f'{data_set}-hybrid-{bucket_names[i]}.hdf5' with h5py.File(fn, 'w') as f: f.attrs['type'] = 'dense' f.attrs['distance'] = ds.attrs['distance'] From 0ba58d1b2a2d71b2a2ae8a097d6c9ecdc1dbe3b3 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Wed, 9 Mar 2022 17:51:05 +0200 Subject: [PATCH 23/77] testers clients now read build stats and add them to their output file --- ann_benchmarks/runner.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index 63418cb9f..a416e40ab 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -6,6 +6,7 @@ import time import traceback import inspect +import h5py import colors import docker @@ -16,7 +17,7 @@ instantiate_algorithm) from ann_benchmarks.datasets import get_dataset, DATASETS from ann_benchmarks.distance import metrics, 
dataset_transform -from ann_benchmarks.results import store_results +from ann_benchmarks.results import get_result_filename, store_results def run_individual_query(algo, X_train, X_test, distance, count, run_count, @@ -149,7 +150,14 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num algo.set_query_arguments(*query_arguments) descriptor, results = run_individual_query( algo, X_train, X_test, distance, count, run_count, batch) - if not test_only: + if test_only: + fn = get_result_filename(dataset, count) + fn = os.path.join(fn, definition.algorithm, 'build_stats') + f = h5py.File(fn, 'r') + descriptor["build_time"] = f.attrs["build_time"] + descriptor["index_size"] = f.attrs["index_size"] + f.close() + else: descriptor["build_time"] = build_time descriptor["index_size"] = index_size descriptor["algo"] = definition.algorithm From 4f6e597e1abd659c5f33571e4c7b7e0732587ae0 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Wed, 9 Mar 2022 17:51:22 +0200 Subject: [PATCH 24/77] fixing types in multirun.py --- multirun.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/multirun.py b/multirun.py index a334e1651..41450459e 100644 --- a/multirun.py +++ b/multirun.py @@ -94,8 +94,8 @@ if args.host is None: args.host = 'localhost' if args.port is None: - if 'redisearch' in args.algorithm: args.port = 6379 - if 'milvus' in args.algorithm: args.port = 19530 + if 'redisearch' in args.algorithm: args.port = '6379' + if 'milvus' in args.algorithm: args.port = '19530' if isredis: redis = RedisCluster if args.cluster else Redis @@ -131,7 +131,7 @@ fn = path.join(fn, args.algorithm) if not path.isdir(fn): makedirs(fn) - fn = path.join(fn, 'build_stats.hdf5') + fn = path.join(fn, 'build_stats') f = h5py.File(fn, 'w') f.attrs["build_time"] = total_time print(fn) @@ -139,14 +139,12 @@ if isredis: if not args.cluster: # TODO: get total size from all the shards index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] - f.attrs["index_size"] = index_size + f.attrs["index_size"] = float(index_size) f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - - if int(args.test_clients) > 0: - queriers = [Process(target=system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, args.test_clients + 1)] + queriers = [Process(target=system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] t0 = time.time() for querier in queriers: querier.start() for querier in queriers: querier.join() From e75dc7976071264f91035da47c19361a68a6d8ad Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 18:06:55 +0200 Subject: [PATCH 25/77] added try..except --- ann_benchmarks/runner.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index a416e40ab..0c5550f51 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -151,12 +151,16 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num descriptor, results = run_individual_query( algo, X_train, X_test, distance, count, run_count, batch) if test_only: - fn = get_result_filename(dataset, count) - fn = os.path.join(fn, definition.algorithm, 'build_stats') - f = h5py.File(fn, 'r') - descriptor["build_time"] = f.attrs["build_time"] - descriptor["index_size"] = f.attrs["index_size"] - f.close() + try: + fn = get_result_filename(dataset, count) + fn = os.path.join(fn, 
definition.algorithm, 'build_stats') + f = h5py.File(fn, 'r') + descriptor["build_time"] = f.attrs["build_time"] + descriptor["index_size"] = f.attrs["index_size"] + f.close() + except: + descriptor["build_time"] = 0 + descriptor["index_size"] = 0 else: descriptor["build_time"] = build_time descriptor["index_size"] = index_size From bb906f1bdd8f717155f2b2b30ca6a955a0262dfe Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 11 Mar 2022 10:54:45 +0000 Subject: [PATCH 26/77] Fixed redisearch query() on non hybrid runs --- ann_benchmarks/algorithms/redisearch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 51aa6454b..f05b6e0df 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -13,6 +13,7 @@ def __init__(self, algo, metric, conn_params, method_param): self.algo = algo self.name = 'redisearch-%s (%s)' % (self.algo, self.method_param) self.index_name = "ann_benchmark" + self.text = None redis = RedisCluster if conn_params['cluster'] else Redis host = conn_params["host"] if conn_params["host"] else 'localhost' From fc9ae71e4ec2c3898f95a7e520440325d77c1d17 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Mon, 14 Mar 2022 19:47:44 +0200 Subject: [PATCH 27/77] aggregate clients --- multirun.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/multirun.py b/multirun.py index 41450459e..ad91508ec 100644 --- a/multirun.py +++ b/multirun.py @@ -1,15 +1,46 @@ -from os import system, path, makedirs from multiprocessing import Process import argparse import time import json +from numpy import average from redis import Redis from redis.cluster import RedisCluster import h5py import os +from watchdog.observers import Observer +from watchdog.events import PatternMatchingEventHandler import pathlib from ann_benchmarks.results import get_result_filename +def aggregate_outputs(files, clients): + different_attrs = set([f.split('client')[0] for f in files]) + assert len(different_attrs) * clients == len(files), "missing files!" 
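As shown in the grouping step below, each tester's output name ends in 'client_<id>.hdf5', so the text before 'client' identifies one parameter configuration. A minimal standalone sketch of that grouping, using hypothetical result-file names:

    # Minimal sketch of the grouping below, with hypothetical result-file names.
    files = ['M_16_client_1.hdf5', 'M_16_client_2.hdf5',
             'M_32_client_1.hdf5', 'M_32_client_2.hdf5']
    prefixes = set(f.split('client')[0] for f in files)            # {'M_16_', 'M_32_'}
    groups = [[p + f'client_{i}.hdf5' for i in (1, 2)] for p in prefixes]
    # -> one group per configuration, holding every client's file for averaging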
+ groups = [[f + f'client_{i}.hdf5' for i in range(1, clients + 1)] for f in different_attrs] + + for group in groups: + fn = group[0].split('client')[0][:-1] + '.hdf5' + f = h5py.File(fn, 'w') + + fs = [h5py.File(fi, 'r') for fi in group] + for k, v in fs[0].attrs.items(): + f.attrs[k] = v + f.attrs["best_search_time"] = average([fi.attrs["best_search_time"] for fi in fs]) + f.attrs["candidates"] = average([fi.attrs["candidates"] for fi in fs]) + + times = f.create_dataset('times', fs[0]['times'].shape, 'f') + neighbors = f.create_dataset('neighbors', fs[0]['neighbors'].shape, 'i') + distances = f.create_dataset('distances', fs[0]['distances'].shape, 'f') + num_tests = len(times) + + for i in range(num_tests): + neighbors[i] = [n for n in fs[0]['neighbors'][i]] + distances[i] = [n for n in fs[0]['distances'][i]] + times[i] = average([fi['times'][i] for fi in fs]) + + [fi.close() for fi in fs] + [os.remove(fi) for fi in group] + f.close() + if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -117,9 +148,13 @@ workdir = pathlib.Path(__file__).parent.absolute() print("Changing the workdir to {}".format(workdir)) os.chdir(workdir) + outputsdir = "{}/{}".format(workdir, get_result_filename(args.dataset, args.count)) + outputsdir = os.path.join(outputsdir, args.algorithm) + if not os.path.isdir(outputsdir): + os.makedirs(outputsdir) results_dict = {} if int(args.build_clients) > 0: - clients = [Process(target=system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] + clients = [Process(target=os.system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] t0 = time.time() for client in clients: client.start() @@ -127,11 +162,7 @@ total_time = time.time() - t0 print(f'total build time: {total_time}\n\n') - fn = "{}/{}".format(workdir, get_result_filename(args.dataset, args.count)) - fn = path.join(fn, args.algorithm) - if not path.isdir(fn): - makedirs(fn) - fn = path.join(fn, 'build_stats') + fn = os.path.join(outputsdir, 'build_stats') f = h5py.File(fn, 'w') f.attrs["build_time"] = total_time print(fn) @@ -144,13 +175,27 @@ results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } if int(args.test_clients) > 0: - queriers = [Process(target=system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] + queriers = [Process(target=os.system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] + test_stats = set() + watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) + def on_created_or_modified(event): + test_stats.add(event.src_path) + watcher.on_created = on_created_or_modified + watcher.on_modified = on_created_or_modified + observer = Observer() + observer.schedule(watcher, workdir, True) + observer.start() t0 = time.time() for querier in queriers: querier.start() for querier in queriers: querier.join() query_time = time.time() - t0 print(f'total test time: {query_time}') + observer.stop() + observer.join() results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } + print(f'summarizing clients data ({len(test_stats)} files into {len(test_stats) // int(args.test_clients)})...') + aggregate_outputs(test_stats, int(args.test_clients)) + print('done!') if args.json_output != "": with open(args.json_output,"w")as json_out_file: From 
1e9c684d84a64093246f5bed0c034126ac2e3c5b Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 17 Mar 2022 15:32:07 +0200 Subject: [PATCH 28/77] improved assertion log --- multirun.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/multirun.py b/multirun.py index ad91508ec..8ccc061e3 100644 --- a/multirun.py +++ b/multirun.py @@ -14,9 +14,23 @@ def aggregate_outputs(files, clients): different_attrs = set([f.split('client')[0] for f in files]) - assert len(different_attrs) * clients == len(files), "missing files!" groups = [[f + f'client_{i}.hdf5' for i in range(1, clients + 1)] for f in different_attrs] + if len(different_attrs) * clients > len(files): + print(f'missing files! got {len(files)} but expected {len(different_attrs) * clients}') + print('got files:') + [print('\t'+f) for f in files] + print('probably missing files:') + [[print('\t'+f) for f in g if f not in files] for g in groups] + assert False + elif len(different_attrs) * clients < len(files): + print(f'too many files! got {len(files)} but expected {len(different_attrs) * clients}') + print('got files:') + [print('\t'+f) for f in files] + print('probably unnecessary files:') + [print('\t'+f) for f in files if len([g for g in groups if f in g]) == 0] + raise False + for group in groups: fn = group[0].split('client')[0][:-1] + '.hdf5' f = h5py.File(fn, 'w') @@ -193,7 +207,7 @@ def on_created_or_modified(event): observer.stop() observer.join() results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } - print(f'summarizing clients data ({len(test_stats)} files into {len(test_stats) // int(args.test_clients)})...') + print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats)} files into {len(test_stats) // int(args.test_clients)})...') aggregate_outputs(test_stats, int(args.test_clients)) print('done!') From 7a5bc76c959b8269604204a36001f7a389c69553 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 21 Mar 2022 16:33:57 +0000 Subject: [PATCH 29/77] fix hybrid creation. 
added big ann --- ann_benchmarks/datasets.py | 8 +++++- create_hybrid_dataset.py | 53 +++++++++++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 4cb79c644..ef00f0950 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -468,7 +468,13 @@ def lastfm(out_fn, n_dimensions, test_size=50000): 'kosarak-jaccard': lambda out_fn: kosarak(out_fn), } -hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'] + +big_ann_datasets = [f'Text-to-Image-{x}' for x in ['10M', '20M', '30M', '40M', '50M', '60M', '70M', '80M', '90M', '100M']] +for dataset in big_ann_datasets: + DATASETS[dataset] = lambda fn: () + + +hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'].extend(big_ann_datasets) percentiles= ['0.5', '1', '2', '5', '10', '20', '50'] for dataset in hybrid_datasets: for percentile in percentiles: diff --git a/create_hybrid_dataset.py b/create_hybrid_dataset.py index c22797f0c..1d389b12d 100644 --- a/create_hybrid_dataset.py +++ b/create_hybrid_dataset.py @@ -8,6 +8,7 @@ import h5py from joblib import Parallel, delayed import multiprocessing +import scipy.spatial def calc_i(i, x, bf, test, neighbors, distances, count, orig_ids): if i % 1000 == 0: @@ -57,7 +58,8 @@ def create_buckets(train): @click.command() @click.option('--data_set', type=click.Choice(DATASETS.keys(), case_sensitive=False), default='glove-100-angular') -def create_ds(data_set): +@click.option('--percentile', type=click.Choice(['0.5', '1', '2', '5', '10', '20', '50'], case_sensitive=False), default=None) +def create_ds(data_set, percentile): ds, dimension= get_dataset(data_set) train = ds['train'] test = ds['test'] @@ -67,7 +69,9 @@ def create_ds(data_set): print(train.shape) buckets, bucket_names, numeric_values, text_values = create_buckets(train) - for i, bucket in enumerate(buckets): + if percentile is not None: + i = ['0.5', '1', '2', '5', '10', '20', '50'].index(percentile) + bucket = buckets[i] fn=f'{data_set}-hybrid-{bucket_names[i]}.hdf5' with h5py.File(fn, 'w') as f: f.attrs['type'] = 'dense' @@ -84,6 +88,7 @@ def create_ds(data_set): f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test # Write the id buckets so on ingestion we will know what data to assign for each id. + for j, id_bucket in enumerate(buckets): np_bucket = np.array(id_bucket, dtype=np.int32) f.create_dataset(f'{bucket_names[j]}_ids', np_bucket.shape, dtype=np_bucket.dtype)[:] = np_bucket @@ -93,13 +98,49 @@ def create_ds(data_set): # Generate ground truth only for the relevan bucket. 
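The ground-truth pass that follows searches only the vectors whose ids fall in the selected bucket and then maps subset positions back to ids in the full train set (this is what the orig_ids argument of calc_i is for). A small self-contained sketch of that remapping on toy data:

    # Toy sketch of the bucket id remapping performed by calc_i (hypothetical ids).
    import numpy as np
    bucket = [3, 7, 19, 200]                     # original train ids in the bucket
    orig_ids = np.array(bucket, dtype=np.int32)
    subset_neighbors = [2, 0]                    # neighbor positions within the subset
    print([int(orig_ids[j]) for j in subset_neighbors])   # -> [19, 3]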
train_bucket = np.array(bucket, dtype = np.int32) - train_set = train[bucket] - print(train_set.shape) + train_set = np.empty((len(bucket), train.shape[1]), dtype=np.float32) + for id in range(len(bucket)): + train_set[id] = train[bucket[id]] bf = BruteForceBLAS(distance, precision=train.dtype) bf.fit(train_set) Parallel(n_jobs=multiprocessing.cpu_count(), require='sharedmem')(delayed(calc_i)(i, x, bf, test, neighbors, distances, count, train_bucket) for i, x in enumerate(test)) - print(neighbors[1]) - print(distances[1]) + + else: + for i, bucket in enumerate(buckets): + fn=f'{data_set}-hybrid-{bucket_names[i]}.hdf5' + with h5py.File(fn, 'w') as f: + f.attrs['type'] = 'dense' + f.attrs['distance'] = ds.attrs['distance'] + f.attrs['dimension'] = len(test[0]) + f.attrs['point_type'] = 'float' + f.attrs['bucket_names'] = bucket_names + f.attrs['selected_bucket'] = bucket_names[i] + for bucket_name in bucket_names: + grp = f.create_group(bucket_name) + grp["text"] = text_values[bucket_name] + grp["number"] = numeric_values[bucket_name] + + f.create_dataset('train', train.shape, dtype=train.dtype)[:] = train + f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test + # Write the id buckets so on ingestion we will know what data to assign for each id. + for j, id_bucket in enumerate(buckets): + np_bucket = np.array(id_bucket, dtype=np.int32) + f.create_dataset(f'{bucket_names[j]}_ids', np_bucket.shape, dtype=np_bucket.dtype)[:] = np_bucket + + neighbors = f.create_dataset(f'neighbors', (len(test), count), dtype='i') + distances = f.create_dataset(f'distances', (len(test), count), dtype='f') + + # Generate ground truth only for the relevan bucket. + train_bucket = np.array(bucket, dtype = np.int32) + train_set = np.empty((len(bucket), train.shape[1]), dtype=np.float32) + for id in range(len(bucket)): + train_set[id] = train[bucket[id]] + print(train_set.shape) + bf = BruteForceBLAS(distance, precision=train.dtype) + bf.fit(train_set) + Parallel(n_jobs=multiprocessing.cpu_count(), require='sharedmem')(delayed(calc_i)(i, x, bf, test, neighbors, distances, count, train_bucket) for i, x in enumerate(test)) + print(neighbors[1]) + print(distances[1]) if __name__ == "__main__": From ddeb8f8289d7cdfa9683397883727f208c48fdb8 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 21 Mar 2022 16:37:17 +0000 Subject: [PATCH 30/77] updated big ann bucket --- ann_benchmarks/datasets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index ef00f0950..14683f339 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -28,6 +28,8 @@ def get_dataset(which): try: if 'hybrid' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which) + elif 'Text-to-Image' in which: + url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/big_ann/%s.hdf5' % urllib.parse.quote(which) else: url = 'http://ann-benchmarks.com/%s.hdf5' % which download(url, hdf5_fn) From b9a18977a08015bb27582fa7210e6d4e89cfb5d5 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 21 Mar 2022 16:41:46 +0000 Subject: [PATCH 31/77] fixed big ann hybrid datasets name --- ann_benchmarks/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 14683f339..c94bf53f5 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -476,7 +476,8 @@ def lastfm(out_fn, n_dimensions, 
test_size=50000): DATASETS[dataset] = lambda fn: () -hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'].extend(big_ann_datasets) +hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'] +hybrid_datasets.extend(big_ann_datasets) percentiles= ['0.5', '1', '2', '5', '10', '20', '50'] for dataset in hybrid_datasets: for percentile in percentiles: From d2d91b7c9f35f857a6fff146ec8c14ddf364fb15 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Wed, 23 Mar 2022 00:26:32 +0200 Subject: [PATCH 32/77] fixed initial capacity on FT.Create --- ann_benchmarks/algorithms/redisearch.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index f05b6e0df..1505e0322 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -4,6 +4,7 @@ from redis.cluster import RedisCluster from ann_benchmarks.constants import INDEX_DIR from ann_benchmarks.algorithms.base import BaseANN +import math class RediSearch(BaseANN): @@ -20,6 +21,9 @@ def __init__(self, algo, metric, conn_params, method_param): port = conn_params["port"] if conn_params["port"] else 6379 self.redis = redis(host=host, port=port, decode_responses=False, password=conn_params["auth"], username=conn_params["user"]) + self.shards = 1 + if conn_params['cluster']: + self.shards = len(self.redis.get_primaries()) def fit(self, X, offset=0, limit=None, hybrid_buckets = None): limit = limit if limit else len(X) @@ -29,9 +33,10 @@ def fit(self, X, offset=0, limit=None, hybrid_buckets = None): args.extend(['n', 'NUMERIC', 't', 'TEXT']) # https://oss.redis.com/redisearch/master/Commands/#ftcreate if self.algo == "HNSW": - args.extend(['vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"]]) + args.extend(['vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', math.ceil(len(X)/self.shards), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"]]) elif self.algo == "FLAT": - args.extend(['vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', len(X), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE']]) + args.extend(['vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', math.ceil(len(X)/self.shards), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE']]) + print("Calling FT.CREATE", *args) self.redis.execute_command('FT.CREATE', *args, target_nodes='random') except Exception as e: if 'Index already exists' not in str(e): From e98a33787c71069992c1474757ba5a38bc77b1e5 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Wed, 23 Mar 2022 11:05:51 +0200 Subject: [PATCH 33/77] fixed race condition --- ann_benchmarks/datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index c94bf53f5..6ce9a4e8a 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -19,7 +19,10 @@ def download(src, dst): def get_dataset_fn(dataset): if not os.path.exists('data'): - os.mkdir('data') + try: + os.mkdir('data') + except FileExistsError: + pass # fixes race condition return os.path.join('data', '%s.hdf5' % dataset) From 
a3bce919611b20a031368d359228632cb51579dc Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Thu, 24 Mar 2022 07:12:06 +0200 Subject: [PATCH 34/77] wip --- ann_benchmarks/algorithms/redisearch.py | 1 + multirun.py | 39 +++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 1505e0322..1395529e2 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -26,6 +26,7 @@ def __init__(self, algo, metric, conn_params, method_param): self.shards = len(self.redis.get_primaries()) def fit(self, X, offset=0, limit=None, hybrid_buckets = None): + self.redis.execute_command("FLUSHALL") limit = limit if limit else len(X) try: args = [self.index_name, 'SCHEMA'] diff --git a/multirun.py b/multirun.py index 8ccc061e3..3100bb085 100644 --- a/multirun.py +++ b/multirun.py @@ -95,6 +95,12 @@ def aggregate_outputs(files, clients): metavar='NAME', help='user name for connection', default=None) + parser.add_argument( + '--full-flow-clients', + type=str, + metavar='NUM', + help='total number of clients running in parallel to execute a full flow (defaults to 0)', + default="0") parser.add_argument( '--build-clients', type=str, @@ -157,6 +163,7 @@ def aggregate_outputs(files, clients): if args.cluster: base += ' --cluster' if args.run_group: base += ' --run-group ' + args.run_group + base_flow = base + ' --runs {} --total-clients {}'.format(args.runs, args.full_flow_clients) base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) workdir = pathlib.Path(__file__).parent.absolute() @@ -167,7 +174,33 @@ def aggregate_outputs(files, clients): if not os.path.isdir(outputsdir): os.makedirs(outputsdir) results_dict = {} - if int(args.build_clients) > 0: + flow_clients = int(args.full_flow_clients) + build_clients = int(args.build_clients) + test_clients = int(args.test_clients) + if flow_clients >- 0 : + queriers = [Process(target=os.system, args=(base_flow + ' --client-id ' + str(i),)) for i in range(1, int(args.full_flow_clients) + 1)] + test_stats = set() + watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) + def on_created_or_modified(event): + test_stats.add(event.src_path) + watcher.on_created = on_created_or_modified + watcher.on_modified = on_created_or_modified + observer = Observer() + observer.schedule(watcher, workdir, True) + observer.start() + t0 = time.time() + for querier in queriers: querier.start() + for querier in queriers: querier.join() + query_time = time.time() - t0 + print(f'total test time: {query_time}') + observer.stop() + observer.join() + results_dict["query"] = {"total_clients":args.full_flow_clients, "test_time": query_time } + print(f'summarizing {int(args.full_flow_clients)} clients data ({len(test_stats)} files into {len(test_stats) // int(args.full_flow_clients)})...') + aggregate_outputs(test_stats, int(args.full_flow_clients)) + print('done!') + + if build_clients > 0 and test_clients == 0: clients = [Process(target=os.system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] t0 = time.time() @@ -188,7 +221,7 @@ def aggregate_outputs(files, clients): f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - if int(args.test_clients) > 0: + elif test_clients > 0 and 
build_clients == 0: queriers = [Process(target=os.system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] test_stats = set() watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) @@ -211,7 +244,7 @@ def on_created_or_modified(event): aggregate_outputs(test_stats, int(args.test_clients)) print('done!') - if args.json_output != "": + elif args.json_output != "": with open(args.json_output,"w")as json_out_file: print(f'storing json result into: {args.json_output}') json.dump(results_dict,json_out_file) From 43a3e21f2432de20120a5b335cf7981ec2719d16 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 24 Mar 2022 16:26:40 +0200 Subject: [PATCH 35/77] sirealized run-groups --- ann_benchmarks/algorithms/definitions.py | 12 +++ multirun.py | 106 +++++++++++++---------- 2 files changed, 73 insertions(+), 45 deletions(-) diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py index b4aea1bf2..64f842474 100644 --- a/ann_benchmarks/algorithms/definitions.py +++ b/ann_benchmarks/algorithms/definitions.py @@ -96,6 +96,18 @@ def get_unique_algorithms(definition_file): return list(sorted(algos)) +def get_run_groups(definition_file, algo = None): + definitions = _get_definitions(definition_file) + run_groups = set() + for point in definitions: + for metric in definitions[point]: + for algorithm in definitions[point][metric]: + if algo == None or algo == algorithm: + for run_group in definitions[point][metric][algorithm]['run-groups'].keys(): + run_groups.add(run_group) + return list(sorted(run_groups)) + + def get_definitions(definition_file, dimension, point_type="float", distance_metric="euclidean", count=10, conn_params={'host': None, 'port': None, 'auth': None, 'user': None, 'cluster': False}): definitions = _get_definitions(definition_file) diff --git a/multirun.py b/multirun.py index 8ccc061e3..5506a9b52 100644 --- a/multirun.py +++ b/multirun.py @@ -11,6 +11,7 @@ from watchdog.events import PatternMatchingEventHandler import pathlib from ann_benchmarks.results import get_result_filename +from ann_benchmarks.algorithms.definitions import get_run_groups def aggregate_outputs(files, clients): different_attrs = set([f.split('client')[0] for f in files]) @@ -145,7 +146,11 @@ def aggregate_outputs(files, clients): if isredis: redis = RedisCluster if args.cluster else Redis redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user) - + + if args.run_group is not None: + run_groups = [args.run_group] + else: + run_groups = get_run_groups('algos.yaml', args.algorithm) base = 'python3 run.py --local --algorithm ' + args.algorithm + ' -k ' + args.count + ' --dataset ' + args.dataset @@ -155,7 +160,6 @@ def aggregate_outputs(files, clients): if args.auth: base += ' --auth ' + args.auth if args.force: base += ' --force' if args.cluster: base += ' --cluster' - if args.run_group: base += ' --run-group ' + args.run_group base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) @@ -166,52 +170,64 @@ def aggregate_outputs(files, clients): outputsdir = os.path.join(outputsdir, args.algorithm) if not os.path.isdir(outputsdir): os.makedirs(outputsdir) - results_dict = {} - if int(args.build_clients) > 0: - clients = [Process(target=os.system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] - - t0 = time.time() - for client 
in clients: client.start() - for client in clients: client.join() - total_time = time.time() - t0 - print(f'total build time: {total_time}\n\n') + results_dicts = [] + test_stats_files = set() + watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) + def on_created_or_modified(event): + test_stats_files.add(event.src_path) + watcher.on_created = on_created_or_modified + watcher.on_modified = on_created_or_modified + observer = Observer() + observer.schedule(watcher, workdir, True) + observer.start() - fn = os.path.join(outputsdir, 'build_stats') - f = h5py.File(fn, 'w') - f.attrs["build_time"] = total_time - print(fn) - index_size = -1 + for run_group in run_groups: if isredis: - if not args.cluster: # TODO: get total size from all the shards - index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] - f.attrs["index_size"] = float(index_size) - f.close() - results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - - if int(args.test_clients) > 0: - queriers = [Process(target=os.system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] - test_stats = set() - watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) - def on_created_or_modified(event): - test_stats.add(event.src_path) - watcher.on_created = on_created_or_modified - watcher.on_modified = on_created_or_modified - observer = Observer() - observer.schedule(watcher, workdir, True) - observer.start() - t0 = time.time() - for querier in queriers: querier.start() - for querier in queriers: querier.join() - query_time = time.time() - t0 - print(f'total test time: {query_time}') - observer.stop() - observer.join() - results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } - print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats)} files into {len(test_stats) // int(args.test_clients)})...') - aggregate_outputs(test_stats, int(args.test_clients)) - print('done!') + redis.flushall() + + results_dict = {} + curr_base_build = base_build + ' --run-group ' + run_group + curr_base_test = base_test + ' --run-group ' + run_group + + if int(args.build_clients) > 0: + clients = [Process(target=os.system, args=(curr_base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] + + t0 = time.time() + for client in clients: client.start() + for client in clients: client.join() + total_time = time.time() - t0 + print(f'total build time: {total_time}\n\n') + + fn = os.path.join(outputsdir, 'build_stats') + f = h5py.File(fn, 'w') + f.attrs["build_time"] = total_time + print(fn) + index_size = -1 + if isredis: + if not args.cluster: # TODO: get total size from all the shards + index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] + f.attrs["index_size"] = float(index_size) + f.close() + results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } + + if int(args.test_clients) > 0: + queriers = [Process(target=os.system, args=(curr_base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] + t0 = time.time() + for querier in queriers: querier.start() + for querier in queriers: querier.join() + query_time = time.time() - t0 + print(f'total test time: {query_time}') + results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } + + results_dicts.append(results_dict) + + observer.stop() + 
observer.join() + print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...') + aggregate_outputs(test_stats_files, int(args.test_clients)) + print('done!') if args.json_output != "": - with open(args.json_output,"w")as json_out_file: + with open(args.json_output,"w") as json_out_file: print(f'storing json result into: {args.json_output}') json.dump(results_dict,json_out_file) From 93f1344f55fd8f33f6db12d6699d7b664e8c1052 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 24 Mar 2022 17:06:16 +0200 Subject: [PATCH 36/77] skips aggregate files when running with 1 client --- multirun.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/multirun.py b/multirun.py index 5506a9b52..9c1d1ddfd 100644 --- a/multirun.py +++ b/multirun.py @@ -171,15 +171,16 @@ def aggregate_outputs(files, clients): if not os.path.isdir(outputsdir): os.makedirs(outputsdir) results_dicts = [] - test_stats_files = set() - watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) - def on_created_or_modified(event): - test_stats_files.add(event.src_path) - watcher.on_created = on_created_or_modified - watcher.on_modified = on_created_or_modified - observer = Observer() - observer.schedule(watcher, workdir, True) - observer.start() + if int(args.test_clients) > 1: + test_stats_files = set() + watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) + def on_created_or_modified(event): + test_stats_files.add(event.src_path) + watcher.on_created = on_created_or_modified + watcher.on_modified = on_created_or_modified + observer = Observer() + observer.schedule(watcher, workdir, True) + observer.start() for run_group in run_groups: if isredis: @@ -221,11 +222,12 @@ def on_created_or_modified(event): results_dicts.append(results_dict) - observer.stop() - observer.join() - print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...') - aggregate_outputs(test_stats_files, int(args.test_clients)) - print('done!') + if int(args.test_clients) > 1: + observer.stop() + observer.join() + print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...') + aggregate_outputs(test_stats_files, int(args.test_clients)) + print('done!') if args.json_output != "": with open(args.json_output,"w") as json_out_file: From 19f82a52b2015cb4f96ca8ecf40208751dca5b0a Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 24 Mar 2022 17:06:59 +0200 Subject: [PATCH 37/77] added dialect 2 for redisreach --- ann_benchmarks/algorithms/redisearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 1505e0322..f96dd7a8b 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -80,7 +80,7 @@ def query(self, v, k): vq = f'(@t:{self.text})=>[KNN {k} @vector $BLOB {qparams}]' else: vq = f'*=>[KNN {k} @vector $BLOB {qparams}]' - q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes()] + q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes(), 'DIALECT', '2'] return [int(doc) for doc in self.redis.execute_command(*q, 
target_nodes='random')[1:]] def freeIndex(self): From 1b9e3fbca4c980d63eb5f61af359ff0621138462 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 24 Mar 2022 18:57:22 +0200 Subject: [PATCH 38/77] added comments --- multirun.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/multirun.py b/multirun.py index 9c1d1ddfd..3b9b0ba5e 100644 --- a/multirun.py +++ b/multirun.py @@ -171,6 +171,8 @@ def aggregate_outputs(files, clients): if not os.path.isdir(outputsdir): os.makedirs(outputsdir) results_dicts = [] + + # skipping aggregation if using one tester if int(args.test_clients) > 1: test_stats_files = set() watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) @@ -222,6 +224,7 @@ def on_created_or_modified(event): results_dicts.append(results_dict) + # skipping aggregation if using one tester if int(args.test_clients) > 1: observer.stop() observer.join() From a6f8345336f752a0d78f4dcb28e64aedb10b6be8 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 25 Mar 2022 08:06:56 +0200 Subject: [PATCH 39/77] In multirun change to the proper workdir asap --- multirun.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/multirun.py b/multirun.py index 3b9b0ba5e..ba482d013 100644 --- a/multirun.py +++ b/multirun.py @@ -135,6 +135,13 @@ def aggregate_outputs(files, clients): help='working with a cluster') args = parser.parse_args() + + # we should change to the proper workdir as soon we parse the args + # given some functions bellow require on relative path to the project + workdir = pathlib.Path(__file__).parent.absolute() + print("Changing the workdir to {}".format(workdir)) + os.chdir(workdir) + isredis = True if 'redisearch' in args.algorithm else False if args.host is None: @@ -163,9 +170,6 @@ def aggregate_outputs(files, clients): base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) - workdir = pathlib.Path(__file__).parent.absolute() - print("Changing the workdir to {}".format(workdir)) - os.chdir(workdir) outputsdir = "{}/{}".format(workdir, get_result_filename(args.dataset, args.count)) outputsdir = os.path.join(outputsdir, args.algorithm) if not os.path.isdir(outputsdir): From e1a4d38991a874f30e4dde79ddb0d1775b65510c Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Fri, 25 Mar 2022 09:34:37 +0300 Subject: [PATCH 40/77] report memory in kb --- multirun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multirun.py b/multirun.py index ba482d013..f4a09d555 100644 --- a/multirun.py +++ b/multirun.py @@ -212,7 +212,7 @@ def on_created_or_modified(event): index_size = -1 if isredis: if not args.cluster: # TODO: get total size from all the shards - index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] + index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb']*1000 f.attrs["index_size"] = float(index_size) f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } From ab08e398cfefa85fff7d3558ec1eabd0a43a24e1 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 25 Mar 2022 08:55:59 +0200 Subject: [PATCH 41/77] fix float conversion of vector_index_sz_mb before multiplying --- multirun.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multirun.py b/multirun.py index f4a09d555..a147984c1 100644 --- a/multirun.py +++ b/multirun.py @@ -212,8 +212,8 @@ def on_created_or_modified(event): index_size 
= -1 if isredis: if not args.cluster: # TODO: get total size from all the shards - index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb']*1000 - f.attrs["index_size"] = float(index_size) + index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb'])*1000 + f.attrs["index_size"] = index_size f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } From 0ce7a5ac85626f0249603ab1c180e251275be846 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 25 Mar 2022 09:07:20 +0200 Subject: [PATCH 42/77] Fixes per PR review --- multirun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multirun.py b/multirun.py index a147984c1..3a23e086f 100644 --- a/multirun.py +++ b/multirun.py @@ -212,7 +212,7 @@ def on_created_or_modified(event): index_size = -1 if isredis: if not args.cluster: # TODO: get total size from all the shards - index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb'])*1000 + index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb'])*1024 f.attrs["index_size"] = index_size f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } From 1b7f86293b8eb85b4f5820de8c5825ec3c7f0012 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 25 Mar 2022 09:59:10 +0200 Subject: [PATCH 43/77] Revert "wip" This reverts commit a3bce919611b20a031368d359228632cb51579dc. --- ann_benchmarks/algorithms/redisearch.py | 1 - multirun.py | 39 ++----------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index c0f0b5744..f96dd7a8b 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -26,7 +26,6 @@ def __init__(self, algo, metric, conn_params, method_param): self.shards = len(self.redis.get_primaries()) def fit(self, X, offset=0, limit=None, hybrid_buckets = None): - self.redis.execute_command("FLUSHALL") limit = limit if limit else len(X) try: args = [self.index_name, 'SCHEMA'] diff --git a/multirun.py b/multirun.py index a6235b8d1..680a5053c 100644 --- a/multirun.py +++ b/multirun.py @@ -96,12 +96,6 @@ def aggregate_outputs(files, clients): metavar='NAME', help='user name for connection', default=None) - parser.add_argument( - '--full-flow-clients', - type=str, - metavar='NUM', - help='total number of clients running in parallel to execute a full flow (defaults to 0)', - default="0") parser.add_argument( '--build-clients', type=str, @@ -174,7 +168,6 @@ def aggregate_outputs(files, clients): if args.force: base += ' --force' if args.cluster: base += ' --cluster' - base_flow = base + ' --runs {} --total-clients {}'.format(args.runs, args.full_flow_clients) base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) outputsdir = "{}/{}".format(workdir, get_result_filename(args.dataset, args.count)) @@ -182,33 +175,7 @@ def aggregate_outputs(files, clients): if not os.path.isdir(outputsdir): os.makedirs(outputsdir) results_dict = {} - flow_clients = int(args.full_flow_clients) - build_clients = int(args.build_clients) - test_clients = int(args.test_clients) - if flow_clients >- 0 : - queriers = [Process(target=os.system, args=(base_flow + ' --client-id ' + str(i),)) for i in range(1, int(args.full_flow_clients) + 1)] - 
test_stats = set() - watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) - def on_created_or_modified(event): - test_stats.add(event.src_path) - watcher.on_created = on_created_or_modified - watcher.on_modified = on_created_or_modified - observer = Observer() - observer.schedule(watcher, workdir, True) - observer.start() - t0 = time.time() - for querier in queriers: querier.start() - for querier in queriers: querier.join() - query_time = time.time() - t0 - print(f'total test time: {query_time}') - observer.stop() - observer.join() - results_dict["query"] = {"total_clients":args.full_flow_clients, "test_time": query_time } - print(f'summarizing {int(args.full_flow_clients)} clients data ({len(test_stats)} files into {len(test_stats) // int(args.full_flow_clients)})...') - aggregate_outputs(test_stats, int(args.full_flow_clients)) - print('done!') - - if build_clients > 0 and test_clients == 0: + if int(args.build_clients) > 0: clients = [Process(target=os.system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] t0 = time.time() @@ -229,7 +196,7 @@ def on_created_or_modified(event): f.close() results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - elif test_clients > 0 and build_clients == 0: + if int(args.test_clients) > 0: queriers = [Process(target=os.system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] test_stats = set() watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) @@ -289,7 +256,7 @@ def on_created_or_modified(event): aggregate_outputs(test_stats_files, int(args.test_clients)) print('done!') - elif args.json_output != "": + if args.json_output != "": with open(args.json_output,"w")as json_out_file: print(f'storing json result into: {args.json_output}') json.dump(results_dict,json_out_file) From 55d4575a8cf6a4dc2246998a0a59b2ea5803cee3 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Fri, 25 Mar 2022 16:13:27 +0200 Subject: [PATCH 44/77] Revert 'wip' --- multirun.py | 74 +++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/multirun.py b/multirun.py index 680a5053c..f005237ea 100644 --- a/multirun.py +++ b/multirun.py @@ -13,6 +13,7 @@ from ann_benchmarks.results import get_result_filename from ann_benchmarks.algorithms.definitions import get_run_groups + def aggregate_outputs(files, clients): different_attrs = set([f.split('client')[0] for f in files]) groups = [[f + f'client_{i}.hdf5' for i in range(1, clients + 1)] for f in different_attrs] @@ -20,42 +21,43 @@ def aggregate_outputs(files, clients): if len(different_attrs) * clients > len(files): print(f'missing files! got {len(files)} but expected {len(different_attrs) * clients}') print('got files:') - [print('\t'+f) for f in files] + [print('\t' + f) for f in files] print('probably missing files:') - [[print('\t'+f) for f in g if f not in files] for g in groups] + [[print('\t' + f) for f in g if f not in files] for g in groups] assert False elif len(different_attrs) * clients < len(files): print(f'too many files! 
got {len(files)} but expected {len(different_attrs) * clients}') print('got files:') - [print('\t'+f) for f in files] + [print('\t' + f) for f in files] print('probably unnecessary files:') - [print('\t'+f) for f in files if len([g for g in groups if f in g]) == 0] + [print('\t' + f) for f in files if len([g for g in groups if f in g]) == 0] raise False - + for group in groups: fn = group[0].split('client')[0][:-1] + '.hdf5' f = h5py.File(fn, 'w') - + fs = [h5py.File(fi, 'r') for fi in group] for k, v in fs[0].attrs.items(): f.attrs[k] = v f.attrs["best_search_time"] = average([fi.attrs["best_search_time"] for fi in fs]) f.attrs["candidates"] = average([fi.attrs["candidates"] for fi in fs]) - + times = f.create_dataset('times', fs[0]['times'].shape, 'f') neighbors = f.create_dataset('neighbors', fs[0]['neighbors'].shape, 'i') distances = f.create_dataset('distances', fs[0]['distances'].shape, 'f') num_tests = len(times) - + for i in range(num_tests): neighbors[i] = [n for n in fs[0]['neighbors'][i]] distances[i] = [n for n in fs[0]['distances'][i]] times[i] = average([fi['times'][i] for fi in fs]) - + [fi.close() for fi in fs] [os.remove(fi) for fi in group] f.close() + if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -153,7 +155,7 @@ def aggregate_outputs(files, clients): if isredis: redis = RedisCluster if args.cluster else Redis redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user) - + if args.run_group is not None: run_groups = [args.run_group] else: @@ -174,34 +176,18 @@ def aggregate_outputs(files, clients): outputsdir = os.path.join(outputsdir, args.algorithm) if not os.path.isdir(outputsdir): os.makedirs(outputsdir) - results_dict = {} - if int(args.build_clients) > 0: - clients = [Process(target=os.system, args=(base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] + results_dicts = [] - t0 = time.time() - for client in clients: client.start() - for client in clients: client.join() - total_time = time.time() - t0 - print(f'total build time: {total_time}\n\n') + # skipping aggregation if using one tester + if int(args.test_clients) > 1: + test_stats_files = set() + watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True) - fn = os.path.join(outputsdir, 'build_stats') - f = h5py.File(fn, 'w') - f.attrs["build_time"] = total_time - print(fn) - index_size = -1 - if isredis: - if not args.cluster: # TODO: get total size from all the shards - index_size = redis.ft('ann_benchmark').info()['vector_index_sz_mb'] - f.attrs["index_size"] = float(index_size) - f.close() - results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } - if int(args.test_clients) > 0: - queriers = [Process(target=os.system, args=(base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] - test_stats = set() - watcher = PatternMatchingEventHandler(["*.hdf5"], ignore_directories=True ) def on_created_or_modified(event): test_stats_files.add(event.src_path) + + watcher.on_created = on_created_or_modified watcher.on_modified = on_created_or_modified observer = Observer() @@ -217,7 +203,8 @@ def on_created_or_modified(event): curr_base_test = base_test + ' --run-group ' + run_group if int(args.build_clients) > 0: - clients = [Process(target=os.system, args=(curr_base_build + ' --client-id ' + str(i),)) for i in range(1, int(args.build_clients) + 1)] + clients = 
[Process(target=os.system, args=(curr_base_build + ' --client-id ' + str(i),)) for i in + range(1, int(args.build_clients) + 1)] t0 = time.time() for client in clients: client.start() @@ -231,20 +218,22 @@ def on_created_or_modified(event): print(fn) index_size = -1 if isredis: - if not args.cluster: # TODO: get total size from all the shards - index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb'])*1024 + if not args.cluster: # TODO: get total size from all the shards + index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb']) * 1024 f.attrs["index_size"] = index_size f.close() - results_dict["build"] = {"total_clients":args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size } + results_dict["build"] = {"total_clients": args.build_clients, "build_time": total_time, + "vector_index_sz_mb": index_size} if int(args.test_clients) > 0: - queriers = [Process(target=os.system, args=(curr_base_test + ' --client-id ' + str(i),)) for i in range(1, int(args.test_clients) + 1)] + queriers = [Process(target=os.system, args=(curr_base_test + ' --client-id ' + str(i),)) for i in + range(1, int(args.test_clients) + 1)] t0 = time.time() for querier in queriers: querier.start() for querier in queriers: querier.join() query_time = time.time() - t0 print(f'total test time: {query_time}') - results_dict["query"] = {"total_clients":args.test_clients, "test_time": query_time } + results_dict["query"] = {"total_clients": args.test_clients, "test_time": query_time} results_dicts.append(results_dict) @@ -252,11 +241,12 @@ def on_created_or_modified(event): if int(args.test_clients) > 1: observer.stop() observer.join() - print(f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...') + print( + f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...') aggregate_outputs(test_stats_files, int(args.test_clients)) print('done!') if args.json_output != "": - with open(args.json_output,"w")as json_out_file: + with open(args.json_output, "w") as json_out_file: print(f'storing json result into: {args.json_output}') - json.dump(results_dict,json_out_file) + json.dump(results_dict, json_out_file) From 79aeedc7e7ff550dac5cdac3387dd6a18d3932a4 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Fri, 25 Mar 2022 18:14:02 +0300 Subject: [PATCH 45/77] changed watcher to watch results dir --- multirun.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/multirun.py b/multirun.py index f005237ea..5795b9774 100644 --- a/multirun.py +++ b/multirun.py @@ -191,7 +191,8 @@ def on_created_or_modified(event): watcher.on_created = on_created_or_modified watcher.on_modified = on_created_or_modified observer = Observer() - observer.schedule(watcher, workdir, True) + results_dir = workdir / 'results' + observer.schedule(watcher, results_dir, True) observer.start() for run_group in run_groups: From 423ad071670f73fa1ae1020066eeb2fc4ec35aa7 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Fri, 25 Mar 2022 18:34:38 +0300 Subject: [PATCH 46/77] Update multirun.py Co-authored-by: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> --- multirun.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/multirun.py b/multirun.py index 5795b9774..77c85c275 100644 --- a/multirun.py +++ b/multirun.py @@ -191,8 +191,7 @@ def on_created_or_modified(event): watcher.on_created = on_created_or_modified 
watcher.on_modified = on_created_or_modified observer = Observer() - results_dir = workdir / 'results' - observer.schedule(watcher, results_dir, True) + observer.schedule(watcher, outputsdir) observer.start() for run_group in run_groups: From 495bbbafc7977741ac9083071abce5c3ed1ef918 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 2 May 2022 11:28:00 +0300 Subject: [PATCH 47/77] dbpedia --- ann_benchmarks/datasets.py | 106 ++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 6ce9a4e8a..f20c8f183 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -29,7 +29,9 @@ def get_dataset_fn(dataset): def get_dataset(which): hdf5_fn = get_dataset_fn(which) try: - if 'hybrid' in which: + if 'dbpedia' in which: + url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/dbpedia/dbpedia-768.hdf5' + elif 'hybrid' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which) elif 'Text-to-Image' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/big_ann/%s.hdf5' % urllib.parse.quote(which) @@ -433,6 +435,103 @@ def lastfm(out_fn, n_dimensions, test_size=50000): # as the inner product on the untransformed data write_output(item_factors, user_factors, out_fn, 'angular') +def parse_dbpedia_data(source_file, max_docs: int): + import re + """ + Parses the input file of abstracts and returns an iterable + :param max_docs: maximum number of input documents to process; -1 for no limit + :param source_file: input file + :return: yields document by document to the consumer + """ + global VERBOSE + count = 0 + max_tokens = 0 + + if -1 < max_docs < 50: + VERBOSE = True + + percent = 0.1 + bulk_size = (percent / 100) * max_docs + + print(f"bulk_size={bulk_size}") + + if bulk_size <= 0: + bulk_size = 1000 + + for line in source_file: + line = line.decode("utf-8") + + # skip commented out lines + comment_regex = '^#' + if re.search(comment_regex, line): + continue + + token_size = len(line.split()) + if token_size > max_tokens: + max_tokens = token_size + + # skip lines with 20 tokens or less, because they tend to contain noise + # (this may vary in your dataset) + if token_size <= 20: + continue + + first_url_regex = '^<([^\>]+)>\s*' + + x = re.search(first_url_regex, line) + if x: + url = x.group(1) + # also remove the url from the string + line = re.sub(first_url_regex, '', line) + else: + url = '' + + # remove the second url from the string: we don't need to capture it, because it is repetitive across + # all abstracts + second_url_regex = '^<[^\>]+>\s*' + line = re.sub(second_url_regex, '', line) + + # remove some strange line ending, that occurs in many abstracts + language_at_ending_regex = '@en \.\n$' + line = re.sub(language_at_ending_regex, '', line) + + # form the input object for this abstract + doc = { + "_text_": line, + "url": url, + "id": count+1 + } + + yield doc + count += 1 + + if count % bulk_size == 0: + print(f"Processed {count} documents", end="\r") + + if count == max_docs: + break + + source_file.close() + print("Maximum tokens observed per abstract: {}".format(max_tokens)) + +def dbpedia(out_fn): + import bz2 + from sentence_transformers import SentenceTransformer + import torch + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print(device) + local_fn = "long_abstracts_en.ttl.bz2" + url = 
"http://downloads.dbpedia.org/2016-10/core-i18n/en/long_abstracts_en.ttl.bz2" + download(url, local_fn) + source_file = bz2.BZ2File(local_fn, "r") + docs_iter = parse_dbpedia_data(source_file=source_file, max_docs=1000000) + text = [] + for doc in docs_iter: + text.append(doc['_text_']) + model = SentenceTransformer('bert-base-nli-mean-tokens') + model.to(device) + sentence_embeddings = model.encode(text, show_progress_bar=True) + write_output(sentence_embeddings, sentence_embeddings[:10000], out_fn, 'angular') + DATASETS = { 'deep-image-96-angular': deep_image, @@ -474,12 +573,15 @@ def lastfm(out_fn, n_dimensions, test_size=50000): } +DATASETS['dbpedia-768'] = lambda fn: dbpedia(fn) + + big_ann_datasets = [f'Text-to-Image-{x}' for x in ['10M', '20M', '30M', '40M', '50M', '60M', '70M', '80M', '90M', '100M']] for dataset in big_ann_datasets: DATASETS[dataset] = lambda fn: () -hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular'] +hybrid_datasets = ['glove-200-angular', 'gist-960-euclidean', 'deep-image-96-angular', 'fashion-mnist-784-euclidean'] hybrid_datasets.extend(big_ann_datasets) percentiles= ['0.5', '1', '2', '5', '10', '20', '50'] for dataset in hybrid_datasets: From 0dd46896dbc81a44b2a604a6b00ec4fac39b166f Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 2 May 2022 12:14:54 +0300 Subject: [PATCH 48/77] fixed PR comment --- ann_benchmarks/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index f20c8f183..6e7ed54da 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -570,10 +570,10 @@ def dbpedia(out_fn): 'sift-256-hamming': lambda out_fn: sift_hamming( out_fn, 'sift.hamming.256'), 'kosarak-jaccard': lambda out_fn: kosarak(out_fn), + 'dbpedia-768' : lambda out_fn: dbpedia(out_fn), } -DATASETS['dbpedia-768'] = lambda fn: dbpedia(fn) big_ann_datasets = [f'Text-to-Image-{x}' for x in ['10M', '20M', '30M', '40M', '50M', '60M', '70M', '80M', '90M', '100M']] From f9970c2f6f01d0b2eee798372ddc231f84909b67 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Wed, 18 May 2022 16:36:22 +0300 Subject: [PATCH 49/77] amazon reviews --- ann_benchmarks/datasets.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index 6e7ed54da..f5711eb88 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -1,3 +1,4 @@ +from copyreg import pickle import h5py import numpy import os @@ -31,6 +32,8 @@ def get_dataset(which): try: if 'dbpedia' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/dbpedia/dbpedia-768.hdf5' + if 'amazon-reviews' in which: + url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/amazon-reviews-384.hdf5' elif 'hybrid' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which) elif 'Text-to-Image' in which: @@ -533,6 +536,24 @@ def dbpedia(out_fn): write_output(sentence_embeddings, sentence_embeddings[:10000], out_fn, 'angular') +def amazon_reviews(out_fn): + import os + import math + import pickle + subsets = ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 
'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02'] + train_set = [] + test_set = [] + for subset in subsets: + url = f'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/{subset}_embeddings' + local_fn = f'{subset}_embeddings' + download(url, local_fn) + subset_embeddings = pickle.load(open(local_fn, "rb")) + train_set.extend(subset_embeddings) + test_set.extend(subset_embeddings[:math.ceil(10000/len(subsets))]) + os.remove(local_fn) + write_output(train_set, test_set[:10000], out_fn, 'angular') + + DATASETS = { 'deep-image-96-angular': deep_image, 'fashion-mnist-784-euclidean': fashion_mnist, @@ -571,6 +592,7 @@ def dbpedia(out_fn): out_fn, 'sift.hamming.256'), 'kosarak-jaccard': lambda out_fn: kosarak(out_fn), 'dbpedia-768' : lambda out_fn: dbpedia(out_fn), + 'amazon-reviews-384': lambda out_fn: amazon_reviews(out_fn), } From 7aeed36adf322ceba907c40096834f3483ba33b6 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Thu, 19 May 2022 12:28:53 +0000 Subject: [PATCH 50/77] fixed amazon review dataset creation --- ann_benchmarks/datasets.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index f5711eb88..c177b4906 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -540,16 +540,24 @@ def amazon_reviews(out_fn): import os import math import pickle + import numpy as np subsets = ['Wireless_v1_00', 'Watches_v1_00', 'Video_Games_v1_00', 'Video_DVD_v1_00', 'Video_v1_00', 'Toys_v1_00', 'Tools_v1_00', 'Sports_v1_00', 'Software_v1_00', 'Shoes_v1_00', 'Pet_Products_v1_00', 'Personal_Care_Appliances_v1_00', 'PC_v1_00', 'Outdoors_v1_00', 'Office_Products_v1_00', 'Musical_Instruments_v1_00', 'Music_v1_00', 'Mobile_Electronics_v1_00', 'Mobile_Apps_v1_00', 'Major_Appliances_v1_00', 'Luggage_v1_00', 'Lawn_and_Garden_v1_00', 'Kitchen_v1_00', 'Jewelry_v1_00', 'Home_Improvement_v1_00', 'Home_Entertainment_v1_00', 'Home_v1_00', 'Health_Personal_Care_v1_00', 'Grocery_v1_00', 'Gift_Card_v1_00', 'Furniture_v1_00', 'Electronics_v1_00', 'Digital_Video_Games_v1_00', 'Digital_Video_Download_v1_00', 'Digital_Software_v1_00', 'Digital_Music_Purchase_v1_00', 'Digital_Ebook_Purchase_v1_00', 'Camera_v1_00', 'Books_v1_00', 'Beauty_v1_00', 'Baby_v1_00', 'Automotive_v1_00', 'Apparel_v1_00', 'Digital_Ebook_Purchase_v1_01', 'Books_v1_01', 'Books_v1_02'] - train_set = [] - test_set = [] - for subset in subsets: + train_set = None + test_set = None + for i, subset in enumerate(subsets): url = f'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/{subset}_embeddings' local_fn = f'{subset}_embeddings' download(url, local_fn) subset_embeddings = pickle.load(open(local_fn, "rb")) - train_set.extend(subset_embeddings) - test_set.extend(subset_embeddings[:math.ceil(10000/len(subsets))]) + if i==0: + train_set = subset_embeddings + test_set = 
subset_embeddings[:math.ceil(10000/len(subsets))] + else: + train_set = np.append(train_set, subset_embeddings, axis =0) + test_set = np.append(test_set, subset_embeddings[:math.ceil(10000/len(subsets))], axis=0) + print(subset_embeddings.shape) + print(train_set.shape) + print(test_set.shape) os.remove(local_fn) write_output(train_set, test_set[:10000], out_fn, 'angular') From 45669454237f94a2dcb0d2dba3ccd54e396e7692 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Tue, 24 May 2022 15:29:00 +0300 Subject: [PATCH 51/77] added shards aux arg --- ann_benchmarks/algorithms/definitions.py | 2 +- ann_benchmarks/algorithms/redisearch.py | 2 +- ann_benchmarks/main.py | 8 +++++++- multirun.py | 7 +++++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/ann_benchmarks/algorithms/definitions.py b/ann_benchmarks/algorithms/definitions.py index 64f842474..d04ceac3b 100644 --- a/ann_benchmarks/algorithms/definitions.py +++ b/ann_benchmarks/algorithms/definitions.py @@ -109,7 +109,7 @@ def get_run_groups(definition_file, algo = None): def get_definitions(definition_file, dimension, point_type="float", - distance_metric="euclidean", count=10, conn_params={'host': None, 'port': None, 'auth': None, 'user': None, 'cluster': False}): + distance_metric="euclidean", count=10, conn_params={'host': None, 'port': None, 'auth': None, 'user': None, 'cluster': False, 'shards': 1}): definitions = _get_definitions(definition_file) algorithm_definitions = {} diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index f96dd7a8b..dc6d97f6b 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -21,7 +21,7 @@ def __init__(self, algo, metric, conn_params, method_param): port = conn_params["port"] if conn_params["port"] else 6379 self.redis = redis(host=host, port=port, decode_responses=False, password=conn_params["auth"], username=conn_params["user"]) - self.shards = 1 + self.shards = conn_params["shards"] if conn_params['cluster']: self.shards = len(self.redis.get_primaries()) diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py index aac871b55..37865e553 100644 --- a/ann_benchmarks/main.py +++ b/ann_benchmarks/main.py @@ -172,6 +172,12 @@ def main(): type=positive_int, help='specific client id (among the total clients)', default=1) + parser.add_argument( + '--shards', + type=str, + metavar='NUM', + default="1", + help='specify number of shards') args = parser.parse_args() if args.timeout == -1: @@ -186,7 +192,7 @@ def main(): if (args.build_only or args.test_only) and not args.local: raise Exception('Can\'t run build or test only on docker') - conn_params = {'host': args.host, 'port': args.port, 'auth': args.auth, 'user': args.user, 'cluster': args.cluster} + conn_params = {'host': args.host, 'port': args.port, 'auth': args.auth, 'user': args.user, 'cluster': args.cluster, 'shards': args.shards} if args.total_clients < args.client_id: raise Exception('must satisfy 1 <= client_id <= total_clients') diff --git a/multirun.py b/multirun.py index 77c85c275..b728f2974 100644 --- a/multirun.py +++ b/multirun.py @@ -135,6 +135,12 @@ def aggregate_outputs(files, clients): '--cluster', action='store_true', help='working with a cluster') + parser.add_argument( + '--shards', + type=str, + metavar='NUM', + default="1", + help='specify number of shards') args = parser.parse_args() @@ -169,6 +175,7 @@ def aggregate_outputs(files, clients): if args.auth: base += ' --auth ' + args.auth if args.force: base += ' --force' if 
args.cluster: base += ' --cluster' + if args.shards: base += ' --shards' + args.shards base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) From ab5ceb0241226db96abeeed61dcaddddbcf8fd78 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Wed, 25 May 2022 12:27:17 +0100 Subject: [PATCH 52/77] Fixed shards arg usage on multirun/redisearch --- ann_benchmarks/algorithms/redisearch.py | 2 +- multirun.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index dc6d97f6b..44308352c 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -21,7 +21,7 @@ def __init__(self, algo, metric, conn_params, method_param): port = conn_params["port"] if conn_params["port"] else 6379 self.redis = redis(host=host, port=port, decode_responses=False, password=conn_params["auth"], username=conn_params["user"]) - self.shards = conn_params["shards"] + self.shards = int(conn_params["shards"]) if conn_params['cluster']: self.shards = len(self.redis.get_primaries()) diff --git a/multirun.py b/multirun.py index b728f2974..ab77f1d84 100644 --- a/multirun.py +++ b/multirun.py @@ -175,7 +175,7 @@ def aggregate_outputs(files, clients): if args.auth: base += ' --auth ' + args.auth if args.force: base += ' --force' if args.cluster: base += ' --cluster' - if args.shards: base += ' --shards' + args.shards + if args.shards: base += ' --shards ' + args.shards base_build = base + ' --build-only --total-clients ' + args.build_clients base_test = base + ' --test-only --runs {} --total-clients {}'.format(args.runs, args.test_clients) From c21597d7a6ac11225d88ea7f1c89b85c0f8b6481 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 30 May 2022 20:07:47 +0300 Subject: [PATCH 53/77] redisearch ef runtime in algo name --- ann_benchmarks/algorithms/redisearch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 44308352c..fba2298c9 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -86,3 +86,5 @@ def query(self, v, k): def freeIndex(self): self.redis.execute_command("FLUSHALL") + def __str__(self): + return self.name + f", efRuntime: {self.ef}" \ No newline at end of file From 8acc9f1d3c3215981cd9261f203d0c4673aa6d9b Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 30 May 2022 20:08:11 +0300 Subject: [PATCH 54/77] print qps to stdout --- ann_benchmarks/runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index abfff2741..2b5b4f768 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -79,6 +79,7 @@ def batch_query(X): search_time = total_time / len(X_test) avg_candidates = total_candidates / len(X_test) best_search_time = min(best_search_time, search_time) + print("qps:", len(X_test)/total_time) verbose = hasattr(algo, "query_verbose") attrs = { From 47c7ddda75b9619bd23e4acc54bef41f2c3d86a6 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 30 May 2022 20:10:42 +0300 Subject: [PATCH 55/77] new line --- ann_benchmarks/algorithms/redisearch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index fba2298c9..441eff2b8 100644 --- 
a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -87,4 +87,4 @@ def freeIndex(self): self.redis.execute_command("FLUSHALL") def __str__(self): - return self.name + f", efRuntime: {self.ef}" \ No newline at end of file + return self.name + f", efRuntime: {self.ef}" From eaa462621ea9f38b7cca6421a0ac6972c4517d91 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Wed, 1 Jun 2022 18:34:18 +0300 Subject: [PATCH 56/77] fix dbpedia download --- ann_benchmarks/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py index c177b4906..8d1549ce6 100644 --- a/ann_benchmarks/datasets.py +++ b/ann_benchmarks/datasets.py @@ -32,7 +32,7 @@ def get_dataset(which): try: if 'dbpedia' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/dbpedia/dbpedia-768.hdf5' - if 'amazon-reviews' in which: + elif 'amazon-reviews' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/amazon_reviews/amazon-reviews-384.hdf5' elif 'hybrid' in which: url = 'https://s3.us-east-1.amazonaws.com/benchmarks.redislabs/vecsim/hybrid_datasets/%s.hdf5' % urllib.parse.quote(which) From 2d6a2231de3c8776d9d616d7c5d7c577b7611e9f Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sat, 4 Jun 2022 20:54:54 +0100 Subject: [PATCH 57/77] Enable recall/latency charts on results --- ann_benchmarks/plotting/metrics.py | 51 +++++++++++++++++++----- ann_benchmarks/plotting/plot_variants.py | 6 ++- ann_benchmarks/plotting/utils.py | 33 ++++++++++++++- 3 files changed, 78 insertions(+), 12 deletions(-) diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py index d16d74250..0702bd317 100644 --- a/ann_benchmarks/plotting/metrics.py +++ b/ann_benchmarks/plotting/metrics.py @@ -79,6 +79,17 @@ def rel(dataset_distances, run_distances, metrics): def queries_per_second(queries, attrs): return 1.0 / attrs["best_search_time"] +def percentile_50(times): + return np.percentile(times, 50.0) * 1000.0 + +def percentile_95(times): + return np.percentile(times, 95.0) * 1000.0 + +def percentile_99(times): + return np.percentile(times, 99.0) * 1000.0 + +def percentile_999(times): + return np.percentile(times, 99.9) * 1000.0 def index_size(queries, attrs): # TODO(erikbern): should replace this with peak memory usage or something @@ -100,53 +111,73 @@ def dist_computations(queries, attrs): all_metrics = { "k-nn": { "description": "Recall", - "function": lambda true_distances, run_distances, metrics, run_attrs: knn(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: knn(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa "worst": float("-inf"), "lim": [0.0, 1.03] }, "epsilon": { "description": "Epsilon 0.01 Recall", - "function": lambda true_distances, run_distances, metrics, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics).attrs['mean'], # noqa "worst": float("-inf") }, "largeepsilon": { "description": "Epsilon 0.1 Recall", - "function": lambda true_distances, run_distances, metrics, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics, 0.1).attrs['mean'], # noqa + "function": lambda true_distances, run_distances, 
metrics, times, run_attrs: epsilon(true_distances, run_distances, run_attrs["count"], metrics, 0.1).attrs['mean'], # noqa "worst": float("-inf") }, "rel": { "description": "Relative Error", - "function": lambda true_distances, run_distances, metrics, run_attrs: rel(true_distances, run_distances, metrics), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: rel(true_distances, run_distances, metrics), # noqa "worst": float("inf") }, "qps": { "description": "Queries per second (1/s)", - "function": lambda true_distances, run_distances, metrics, run_attrs: queries_per_second(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: queries_per_second(true_distances, run_attrs), # noqa "worst": float("-inf") }, + "p50": { + "description": "Percentile 50 (millis)", + "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_50(times), # noqa + "worst": float("inf") + }, + "p95": { + "description": "Percentile 95 (millis)", + "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_95(times), # noqa + "worst": float("inf") + }, + "p99": { + "description": "Percentile 99 (millis)", + "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_99(times), # noqa + "worst": float("inf") + }, + "p999": { + "description": "Percentile 99.9 (millis)", + "function": lambda true_distances, run_distances, metrics, times, run_attrs: percentile_999(times), # noqa + "worst": float("inf") + }, "distcomps": { "description": "Distance computations", - "function": lambda true_distances, run_distances, metrics, run_attrs: dist_computations(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: dist_computations(true_distances, run_attrs), # noqa "worst": float("inf") }, "build": { "description": "Build time (s)", - "function": lambda true_distances, run_distances, metrics, run_attrs: build_time(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: build_time(true_distances, run_attrs), # noqa "worst": float("inf") }, "candidates": { "description": "Candidates generated", - "function": lambda true_distances, run_distances, metrics, run_attrs: candidates(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: candidates(true_distances, run_attrs), # noqa "worst": float("inf") }, "indexsize": { "description": "Index size (kB)", - "function": lambda true_distances, run_distances, metrics, run_attrs: index_size(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size(true_distances, run_attrs), # noqa "worst": float("inf") }, "queriessize": { "description": "Index size (kB)/Queries per second (s)", - "function": lambda true_distances, run_distances, metrics, run_attrs: index_size(true_distances, run_attrs) / queries_per_second(true_distances, run_attrs), # noqa + "function": lambda true_distances, run_distances, metrics, times, run_attrs: index_size(true_distances, run_attrs) / queries_per_second(true_distances, run_attrs), # noqa "worst": float("inf") } } diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py index f3632c00d..a30d06dfd 100644 --- a/ann_benchmarks/plotting/plot_variants.py +++ b/ann_benchmarks/plotting/plot_variants.py @@ -9,5 +9,9 @@ 
"recall/candidates": ("k-nn", "candidates"), "recall/qpssize": ("k-nn", "queriessize"), "eps/time": ("epsilon", "qps"), - "largeeps/time": ("largeepsilon", "qps") + "largeeps/time": ("largeepsilon", "qps"), + "recall/p50": ("k-nn", "p50"), + "recall/p95": ("k-nn", "p95"), + "recall/p99": ("k-nn", "p99"), + "recall/p999": ("k-nn", "p999"), } diff --git a/ann_benchmarks/plotting/utils.py b/ann_benchmarks/plotting/utils.py index 46b3ed4f6..623253019 100644 --- a/ann_benchmarks/plotting/utils.py +++ b/ann_benchmarks/plotting/utils.py @@ -45,6 +45,8 @@ def compute_metrics(true_nn_distances, res, metric_1, metric_2, algo_name = properties['name'] # cache distances to avoid access to hdf5 file run_distances = numpy.array(run['distances']) + # cache times to avoid access to hdf5 file + times = numpy.array(run['times']) if recompute and 'metrics' in run: del run['metrics'] metrics_cache = get_or_create_metrics(run) @@ -73,18 +75,47 @@ def compute_all_metrics(true_nn_distances, run, properties, recompute=False): results = {} # cache distances to avoid access to hdf5 file run_distances = numpy.array(run["distances"]) + # cache times to avoid access to hdf5 file + times = numpy.array(run['times']) if recompute and 'metrics' in run: del run['metrics'] metrics_cache = get_or_create_metrics(run) for name, metric in metrics.items(): v = metric["function"]( - true_nn_distances, run_distances, metrics_cache, properties) + true_nn_distances, run_distances, metrics_cache, times, properties) results[name] = v if v: print('%s: %g' % (name, v)) return (algo, algo_name, results) +def compute_metrics_all_runs(dataset, res, recompute=False): + true_nn_distances=list(dataset['distances']) + for i, (properties, run) in enumerate(res): + algo = properties['algo'] + algo_name = properties['name'] + # cache distances to avoid access to hdf5 file + # print('Load distances and times') + run_distances = numpy.array(run['distances']) + times = numpy.array(run['times']) + # print('... 
done') + if recompute and 'metrics' in run: + print('Recomputing metrics, clearing cache') + del run['metrics'] + metrics_cache = get_or_create_metrics(run) + + dataset = properties['dataset'] + + run_result = { + 'algorithm': algo, + 'parameters': algo_name, + 'count': properties['count'] + } + for name, metric in metrics.items(): + v = metric["function"](true_nn_distances, run_distances, metrics_cache, times, properties) + run_result[name] = v + yield run_result + def generate_n_colors(n): vs = numpy.linspace(0.3, 0.9, 7) From 3cd6fbb926820f69f4a7afc7f3f3178174075728 Mon Sep 17 00:00:00 2001 From: DvirDukhan Date: Mon, 6 Jun 2022 08:51:42 +0300 Subject: [PATCH 58/77] removed create command optimizations --- ann_benchmarks/algorithms/redisearch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py index 441eff2b8..f37030139 100644 --- a/ann_benchmarks/algorithms/redisearch.py +++ b/ann_benchmarks/algorithms/redisearch.py @@ -33,9 +33,9 @@ def fit(self, X, offset=0, limit=None, hybrid_buckets = None): args.extend(['n', 'NUMERIC', 't', 'TEXT']) # https://oss.redis.com/redisearch/master/Commands/#ftcreate if self.algo == "HNSW": - args.extend(['vector', 'VECTOR', self.algo, '12', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', math.ceil(len(X)/self.shards), 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"]]) + args.extend(['vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'M', self.method_param['M'], 'EF_CONSTRUCTION', self.method_param["efConstruction"]]) elif self.algo == "FLAT": - args.extend(['vector', 'VECTOR', self.algo, '10', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric, 'INITIAL_CAP', math.ceil(len(X)/self.shards), 'BLOCK_SIZE', self.method_param['BLOCK_SIZE']]) + args.extend(['vector', 'VECTOR', self.algo, '6', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric]) print("Calling FT.CREATE", *args) self.redis.execute_command('FT.CREATE', *args, target_nodes='random') except Exception as e: From cf559609a64d9ac633dfda2a33a40a098ab90bb9 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Mon, 6 Jun 2022 15:47:49 +0100 Subject: [PATCH 59/77] Fixes per PR review --- ann_benchmarks/plotting/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ann_benchmarks/plotting/utils.py b/ann_benchmarks/plotting/utils.py index 623253019..4b18bb1b2 100644 --- a/ann_benchmarks/plotting/utils.py +++ b/ann_benchmarks/plotting/utils.py @@ -53,10 +53,10 @@ def compute_metrics(true_nn_distances, res, metric_1, metric_2, metric_1_value = metrics[metric_1]['function']( true_nn_distances, - run_distances, metrics_cache, properties) + run_distances, metrics_cache, times, properties) metric_2_value = metrics[metric_2]['function']( true_nn_distances, - run_distances, metrics_cache, properties) + run_distances, metrics_cache, times, properties) print('%3d: %80s %12.3f %12.3f' % (i, algo_name, metric_1_value, metric_2_value)) From cbe99b0059a6dccca7210feac98f5b13c9809471 Mon Sep 17 00:00:00 2001 From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> Date: Tue, 14 Jun 2022 11:47:18 +0300 Subject: [PATCH 60/77] Milvus update (#28) * updating milvus code * added website dir to git ignore --- .gitignore | 1 + ann_benchmarks/algorithms/milvus.py | 32 ++++++++++++++++++++++++++--- multirun.py | 11 ++++++++-- 3 files changed, 39 
insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 2ae6d3de3..08e22ea94 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ data/* results/* !results/*.png +website venv diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py index 78e2c1ba9..a763e1e00 100644 --- a/ann_benchmarks/algorithms/milvus.py +++ b/ann_benchmarks/algorithms/milvus.py @@ -12,6 +12,7 @@ import numpy import sklearn.preprocessing from ann_benchmarks.algorithms.base import BaseANN +import sys class Milvus(BaseANN): @@ -29,10 +30,13 @@ def __init__(self, metric, dim, conn_params, index_type, method_params): FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim) ] schema = CollectionSchema(fields) - self._milvus = Collection('milvus', schema) - self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':self._metric, 'params':self._method_params}) + if utility.has_collection('milvus'): + self._milvus = Collection('milvus') + else: + self._milvus = Collection('milvus', schema) except: self._milvus = Collection('milvus') + print('initialization completed!') def fit(self, X, offset=0, limit=None): limit = limit if limit else len(X) @@ -40,12 +44,31 @@ def fit(self, X, offset=0, limit=None): if self._metric == 'IP': X = sklearn.preprocessing.normalize(X) - self._milvus.insert([[id for id in range(offset, limit)], X.tolist()]) + X = X.tolist() + bulk_size = 1000 * 1024 * 1024 // (sys.getsizeof(X[0])) # approximation for milvus insert limit (1024MB) + for bulk in [X[i: i+bulk_size] for i in range(0, len(X), bulk_size)]: + print(f'inserting vectors {offset} to {len(bulk)}') + self._milvus.insert([list(range(offset, len(bulk))), bulk]) + offset += len(bulk) + + if not self._milvus.has_index(): + print('indexing...', end=' ') + try: + self._milvus.create_index('vector', {'index_type': self._index_type, 'metric_type':self._metric, 'params':self._method_params}) + print('done!') + except: + print('failed!') + def set_query_arguments(self, param): if self._milvus.has_index(): + print('waiting for index... ', end='') if utility.wait_for_index_building_complete('milvus', 'vector'): + print('done!') self._milvus.load() + print('waiting for data to be loaded... 
', end='')
+                utility.wait_for_loading_complete('milvus')
+                print('done!')
             else: raise Exception('index has error')
         else: raise Exception('index is missing')
         if 'IVF_' in self._index_type:
@@ -71,3 +94,6 @@ def __str__(self):
 
     def freeIndex(self):
         utility.drop_collection("mlivus")
+
+    def done(self):
+        connections.disconnect('default')
diff --git a/multirun.py b/multirun.py
index ab77f1d84..e16681ef6 100644
--- a/multirun.py
+++ b/multirun.py
@@ -5,6 +5,7 @@
 import time
 import json
 from numpy import average
 from redis import Redis
 from redis.cluster import RedisCluster
+from pymilvus import utility, connections
 import h5py
 import os
@@ -151,16 +152,19 @@ def aggregate_outputs(files, clients):
     os.chdir(workdir)
 
     isredis = True if 'redisearch' in args.algorithm else False
+    ismilvus = True if 'milvus' in args.algorithm else False
 
     if args.host is None:
         args.host = 'localhost'
     if args.port is None:
-        if 'redisearch' in args.algorithm: args.port = '6379'
-        if 'milvus' in args.algorithm: args.port = '19530'
+        if isredis: args.port = '6379'
+        elif ismilvus: args.port = '19530'
 
     if isredis:
         redis = RedisCluster if args.cluster else Redis
         redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user)
+    elif ismilvus:
+        connections.connect(host=args.host, port=args.port)
 
     if args.run_group is not None:
         run_groups = [args.run_group]

From 592f3ed8187ea25009259f71e0324784ab9da206 Mon Sep 17 00:00:00 2001
From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com>
Date: Thu, 23 Jun 2022 18:45:28 +0300
Subject: [PATCH 61/77] Added pinecone client (#29)

* added pinecone client
* added support for pinecone in multirun.py
* removed unnecessary var
* added 'exact' run group for pinecone
* generalized pinecone flush
---
 algos.yaml                            | 10 +++++++
 ann_benchmarks/algorithms/pinecone.py | 39 +++++++++++++++++++++
 multirun.py                           | 16 ++++++++---
 3 files changed, 62 insertions(+), 3 deletions(-)
 create mode 100644 ann_benchmarks/algorithms/pinecone.py

diff --git a/algos.yaml b/algos.yaml
index 1e48525a2..231f08667 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -51,6 +51,16 @@ float:
     BS-2^20:
       arg-groups:
         - {"BLOCK_SIZE": 1048576}
+  pinecone:
+    docker-tag: ann-benchmarks-pinecone
+    module: ann_benchmarks.algorithms.pinecone
+    constructor: Pinecone
+    base-args: ["@metric", "@dimension", "@connection"]
+    run-groups:
+      approximated:
+        args: [['approximated']]
+      exact:
+        args: [['exact']]
   sptag:
     docker-tag: ann-benchmarks-sptag
     module: ann_benchmarks.algorithms.sptag
diff --git a/ann_benchmarks/algorithms/pinecone.py b/ann_benchmarks/algorithms/pinecone.py
new file mode 100644
index 000000000..f7d09c174
--- /dev/null
+++ b/ann_benchmarks/algorithms/pinecone.py
@@ -0,0 +1,39 @@
+from __future__ import absolute_import
+from sqlite3 import paramstyle
+from ann_benchmarks.algorithms.base import BaseANN
+import sys
+import pinecone
+
+class Pinecone(BaseANN):
+    def __init__(self, metric, dim, conn_params, type):
+        pinecone.init(api_key=conn_params['auth'])
+        m = {'angular': 'cosine', 'euclidean': 'euclidean'}[metric]
+        self.name = 'ann-benchmark'
+        if self.name not in pinecone.list_indexes():
+            pinecone.create_index(self.name, dimension=dim, metric=m,
+                                  index_type=type, shards=int(conn_params["shards"]), )
+
self.index = pinecone.Index(self.name) + + def fit(self, X, offset=0, limit=None): + limit = limit if limit else len(X) + + bulk = [(str(i), X[i].tolist()) for i in range(offset, limit)] + # approximation for pinecone insert limit (2MB or 1000 vectors) + batch_size = min(1000, 2 * 1024 * 1024 // (sys.getsizeof(bulk[-1]))) # bulk[-1] should be the largest (longest name) + + for batch in [bulk[i: i+batch_size] for i in range(0, len(bulk), batch_size)]: + # print(f'inserting vectors {batch[0][0]} to {batch[-1][0]}') + self.index.upsert(batch) + + # print(self.index.describe_index_stats()) + # print(pinecone.describe_index(self.name)) + + def query(self, v, n): + res = self.index.query(v.tolist(), top_k=n) + return [int(e['id']) for e in res['matches']] + + def freeIndex(self): + pinecone.delete_index(self.name) + + def __str__(self): + return f'Pinecone({pinecone.describe_index(self.name)})' diff --git a/multirun.py b/multirun.py index e16681ef6..b81e46b36 100644 --- a/multirun.py +++ b/multirun.py @@ -3,9 +3,6 @@ import time import json from numpy import average -from redis import Redis -from redis.cluster import RedisCluster -from pymilvus import utility, connections import h5py import os from watchdog.observers import Observer @@ -14,6 +11,13 @@ from ann_benchmarks.results import get_result_filename from ann_benchmarks.algorithms.definitions import get_run_groups +from redis import Redis +from redis.cluster import RedisCluster + +from pymilvus import utility, connections + +import pinecone + def aggregate_outputs(files, clients): different_attrs = set([f.split('client')[0] for f in files]) @@ -153,6 +157,7 @@ def aggregate_outputs(files, clients): isredis = True if 'redisearch' in args.algorithm else False ismilvus = True if 'milvus' in args.algorithm else False + ispinecone = True if 'pinecone' in args.algorithm else False if args.host is None: args.host = 'localhost' @@ -165,6 +170,8 @@ def aggregate_outputs(files, clients): redis = redis(host=args.host, port=int(args.port), password=args.auth, username=args.user) elif ismilvus: connections.connect(host=args.host, port=args.port) + elif ispinecone: + pinecone.init(api_key=args.auth) if args.run_group is not None: run_groups = [args.run_group] @@ -211,6 +218,9 @@ def on_created_or_modified(event): elif ismilvus: if utility.has_collection('milvus'): utility.drop_collection('milvus') + elif ispinecone: + for idx in pinecone.list_indexes(): + pinecone.delete_index(idx) results_dict = {} curr_base_build = base_build + ' --run-group ' + run_group From 12183f145d0b76e56312937147f7b8ba6e0a2577 Mon Sep 17 00:00:00 2001 From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com> Date: Thu, 23 Jun 2022 18:46:21 +0300 Subject: [PATCH 62/77] splitting the test load between test clients (#30) --- ann_benchmarks/runner.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index 2b5b4f768..46c768b9f 100644 --- a/ann_benchmarks/runner.py +++ b/ann_benchmarks/runner.py @@ -157,6 +157,14 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num if not build_only: print('got %d queries' % len(X_test)) + per_client = len(X_test) // num_clients + offset = per_client * (id - 1) + if (num_clients != id): + X_test = X_test[offset : offset + per_client] + else: + X_test = X_test[offset:] + print('running %d out of them' % len(X_test)) + for pos, query_arguments in enumerate(query_argument_groups, 1): print("Running query argument group %d of %d..." 
              % (pos, len(query_argument_groups)))

From 58c10f6d76083b3bbfa01d8916e75415d90aab54 Mon Sep 17 00:00:00 2001
From: esandoval30
Date: Tue, 5 Jul 2022 10:37:18 +0100
Subject: [PATCH 63/77] Update requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 99b97aa38..0bdd03e3a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,5 @@ psutil==5.6.6
 scipy==1.3.3
 scikit-learn==0.22.2
 jinja2==2.10
+pymilvus==2.0.2
+pinecone-client==2.0.11

From accc74464d3d309865c763f834a94facd19957f6 Mon Sep 17 00:00:00 2001
From: esandoval30
Date: Tue, 5 Jul 2022 11:12:40 +0100
Subject: [PATCH 64/77] Update requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 0bdd03e3a..fc3552900 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@
 jinja2==2.10
 pymilvus==2.0.2
 pinecone-client==2.0.11
+redis==4.3.2

From fb3a46ca7f7b0a4c2661669910ab57465b5ea1d4 Mon Sep 17 00:00:00 2001
From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com>
Date: Tue, 19 Jul 2022 10:52:48 +0300
Subject: [PATCH 65/77] fixing double fetching (#33)

---
 ann_benchmarks/runner.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index 46c768b9f..1ad0a83c5 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -107,13 +107,10 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num
     function""" % (definition.module, definition.constructor, definition.arguments)
 
     D, dimension = get_dataset(dataset)
-    X_train = numpy.array(D['train'])
-    X_test = numpy.array(D['test'])
+    X_train, X_test = dataset_transform(D)
     distance = D.attrs['distance']
     print('got a train set of size (%d * %d)' % (X_train.shape[0], dimension))
 
-    X_train, X_test = dataset_transform(D)
-
     hybrid_buckets = None
     if 'bucket_names' in D.attrs:
         hybrid_buckets = {}

From 020ad0812a0d44e4111b239426309a325c29e24b Mon Sep 17 00:00:00 2001
From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com>
Date: Thu, 21 Jul 2022 12:23:41 +0300
Subject: [PATCH 66/77] fixing bulk insertion (#36)

---
 ann_benchmarks/algorithms/milvus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ann_benchmarks/algorithms/milvus.py b/ann_benchmarks/algorithms/milvus.py
index a763e1e00..da961d408 100644
--- a/ann_benchmarks/algorithms/milvus.py
+++ b/ann_benchmarks/algorithms/milvus.py
@@ -47,8 +47,8 @@ def fit(self, X, offset=0, limit=None):
         X = X.tolist()
         bulk_size = 1000 * 1024 * 1024 // (sys.getsizeof(X[0])) # approximation for milvus insert limit (1024MB)
         for bulk in [X[i: i+bulk_size] for i in range(0, len(X), bulk_size)]:
-            print(f'inserting vectors {offset} to {len(bulk)}')
-            self._milvus.insert([list(range(offset, len(bulk))), bulk])
+            print(f'inserting vectors {offset} to {offset + len(bulk) - 1}')
+            self._milvus.insert([list(range(offset, offset + len(bulk))), bulk])
             offset += len(bulk)
 
         if not self._milvus.has_index():

From 3b5012d1edc7d68ae284b34e33f07bbd47a89f33 Mon Sep 17 00:00:00 2001
From: GuyAv46 <47632673+GuyAv46@users.noreply.github.com>
Date: Tue, 16 Aug 2022 14:38:20 +0300
Subject: [PATCH 67/77] Elastic client update (#34)

* first changes
* more work
* make it work
* added certification support
* improved importing
* supporting both secure and not secure settings
* some cleanup
* another requirement update
* improved multirun on existing index
* added shard settings
* improved 
`aggregate_outputs` --- algos.yaml | 48 +++++++++--- ann_benchmarks/algorithms/elasticsearch.py | 87 ++++++++++++---------- multirun.py | 78 ++++++++++++------- requirements.txt | 3 +- 4 files changed, 135 insertions(+), 81 deletions(-) diff --git a/algos.yaml b/algos.yaml index 231f08667..942a3a321 100644 --- a/algos.yaml +++ b/algos.yaml @@ -672,14 +672,6 @@ float: - {"n_neighbors": 60, "diversify_prob": 0.0, "pruning_degree_multiplier":[2.0, 3.0], "leaf_size": 48} query-args: [[0.0, 0.04, 0.08, 0.12, 0.16, 0.20, 0.24, 0.28, 0.32, 0.36]] - elasticsearch: - docker-tag: ann-benchmarks-elasticsearch - module: ann_benchmarks.algorithms.elasticsearch - constructor: ElasticsearchScriptScoreQuery - base-args: [ "@metric", "@dimension" ] - run-groups: - empty: - args: [] elastiknn-l2lsh: docker-tag: ann-benchmarks-elastiknn module: ann_benchmarks.algorithms.elastiknn @@ -974,10 +966,44 @@ float: docker-tag: ann-benchmarks-elasticsearch module: ann_benchmarks.algorithms.elasticsearch constructor: ElasticsearchScriptScoreQuery - base-args: [ "@metric", "@dimension" ] + base-args: [ "@metric", "@dimension", "@connection" ] run-groups: - empty: - args: [] + M-4: + arg-groups: + - {"m": 4, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-8: + arg-groups: + - {"m": 8, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"m": 12, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"m": 16, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"m": 24, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"m": 36, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"m": 48, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"m": 64, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"m": 96, "ef_construction": 500, "type": "hnsw"} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] opensearchknn: docker-tag: ann-benchmarks-opensearchknn module: ann_benchmarks.algorithms.opensearchknn diff --git a/ann_benchmarks/algorithms/elasticsearch.py b/ann_benchmarks/algorithms/elasticsearch.py index 95eccab09..989cf4aa8 100644 --- a/ann_benchmarks/algorithms/elasticsearch.py +++ b/ann_benchmarks/algorithms/elasticsearch.py @@ -4,11 +4,12 @@ """ import logging from time import sleep +from os import environ from urllib.error import URLError -from urllib.request import Request, urlopen -from elasticsearch import Elasticsearch +from elasticsearch import Elasticsearch, BadRequestError from elasticsearch.helpers import bulk +from elastic_transport.client_utils import DEFAULT from ann_benchmarks.algorithms.base import BaseANN @@ -20,19 +21,18 @@ # logging.basicConfig(level=logging.INFO) # logging.getLogger("elasticsearch").setLevel(logging.INFO) -def es_wait(): +def es_wait(es): print("Waiting for elasticsearch health endpoint...") - req = Request("http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=1s") for i in range(30): try: - res = urlopen(req) - if res.getcode() == 200: + res = es.cluster.health(wait_for_status='yellow', timeout='1s') + if 
not res['timed_out']: # then status is OK print("Elasticsearch is ready") return except URLError: pass sleep(1) - raise RuntimeError("Failed to connect to local elasticsearch") + raise RuntimeError("Failed to connect to elasticsearch server") class ElasticsearchScriptScoreQuery(BaseANN): @@ -42,57 +42,62 @@ class ElasticsearchScriptScoreQuery(BaseANN): - Dense vector queries: https://www.elastic.co/guide/en/elasticsearch/reference/master/query-dsl-script-score-query.html """ - def __init__(self, metric: str, dimension: int): - self.name = f"elasticsearch-script-score-query_metric={metric}_dimension={dimension}" - self.metric = metric + def __init__(self, metric: str, dimension: int, conn_params, method_param): + self.name = f"elasticsearch-script-score-query_metric={metric}_dimension={dimension}_params{method_param}" + self.metric = {"euclidean": 'l2_norm', "angular": 'cosine'}[metric] + self.method_param = method_param self.dimension = dimension - self.index = f"es-ssq-{metric}-{dimension}" - self.es = Elasticsearch(["http://localhost:9200"]) + self.timeout = 60 * 60 + h = conn_params['host'] if conn_params['host'] is not None else 'localhost' + p = conn_params['port'] if conn_params['port'] is not None else '9200' + u = conn_params['user'] if conn_params['user'] is not None else 'elastic' + a = conn_params['auth'] if conn_params['auth'] is not None else '' + self.index = "ann_benchmark" + self.shards = conn_params['shards'] + try: + self.es = Elasticsearch(f"http://{h}:{p}", request_timeout=self.timeout, basic_auth=(u, a), refresh_interval=-1) + self.es.info() + except Exception: + self.es = Elasticsearch(f"https://{h}:{p}", request_timeout=self.timeout, basic_auth=(u, a), ca_certs=environ.get('ELASTIC_CA', DEFAULT)) self.batch_res = [] - if self.metric == "euclidean": - self.script = "1 / (1 + l2norm(params.query_vec, \"vec\"))" - elif self.metric == "angular": - self.script = "1.0 + cosineSimilarity(params.query_vec, \"vec\")" - else: - raise NotImplementedError(f"Not implemented for metric {self.metric}") - es_wait() + es_wait(self.es) def fit(self, X): - body = dict(settings=dict(number_of_shards=1, number_of_replicas=0)) - mapping = dict( + mappings = dict( properties=dict( id=dict(type="keyword", store=True), - vec=dict(type="dense_vector", dims=self.dimension) + vec=dict( + type="dense_vector", + dims=self.dimension, + similarity=self.metric, + index=True, + index_options=self.method_param + ) ) ) - self.es.indices.create(self.index, body=body) - self.es.indices.put_mapping(mapping, self.index) + try: + self.es.indices.create(index=self.index, mappings=mappings, settings=dict(number_of_shards=self.shards, number_of_replicas=0)) + except BadRequestError as e: + if 'resource_already_exists_exception' not in e.message: raise e def gen(): for i, vec in enumerate(X): - yield { "_op_type": "index", "_index": self.index, "vec": vec.tolist(), 'id': str(i + 1) } + yield { "_op_type": "index", "_index": self.index, "vec": vec.tolist(), 'id': str(i) } (_, errors) = bulk(self.es, gen(), chunk_size=500, max_retries=9) assert len(errors) == 0, errors - self.es.indices.refresh(self.index) - self.es.indices.forcemerge(self.index, max_num_segments=1) + self.es.indices.refresh(index=self.index) + self.es.indices.forcemerge(index=self.index, max_num_segments=1) + + def set_query_arguments(self, ef): + self.ef = ef def query(self, q, n): - body = dict( - query=dict( - script_score=dict( - query=dict(match_all=dict()), - script=dict( - source=self.script, - params=dict(query_vec=q.tolist()) - ) - ) - 
) - ) - res = self.es.search(index=self.index, body=body, size=n, _source=False, docvalue_fields=['id'], - stored_fields="_none_", filter_path=["hits.hits.fields.id"]) - return [int(h['fields']['id'][0]) - 1 for h in res['hits']['hits']] + knn = dict(field='vec', query_vector=q.tolist(), k=n, num_candidates=self.ef) + res = self.es.knn_search(index=self.index, knn=knn, source=False, docvalue_fields=['id'], + stored_fields="_none_", filter_path=["hits.hits.fields.id"]) + return [int(h['fields']['id'][0]) for h in res['hits']['hits']] def batch_query(self, X, n): self.batch_res = [self.query(q, n) for q in X] diff --git a/multirun.py b/multirun.py index b81e46b36..b4fb58ab4 100644 --- a/multirun.py +++ b/multirun.py @@ -11,13 +11,6 @@ from ann_benchmarks.results import get_result_filename from ann_benchmarks.algorithms.definitions import get_run_groups -from redis import Redis -from redis.cluster import RedisCluster - -from pymilvus import utility, connections - -import pinecone - def aggregate_outputs(files, clients): different_attrs = set([f.split('client')[0] for f in files]) @@ -48,15 +41,10 @@ def aggregate_outputs(files, clients): f.attrs["best_search_time"] = average([fi.attrs["best_search_time"] for fi in fs]) f.attrs["candidates"] = average([fi.attrs["candidates"] for fi in fs]) - times = f.create_dataset('times', fs[0]['times'].shape, 'f') - neighbors = f.create_dataset('neighbors', fs[0]['neighbors'].shape, 'i') - distances = f.create_dataset('distances', fs[0]['distances'].shape, 'f') - num_tests = len(times) - - for i in range(num_tests): - neighbors[i] = [n for n in fs[0]['neighbors'][i]] - distances[i] = [n for n in fs[0]['distances'][i]] - times[i] = average([fi['times'][i] for fi in fs]) + # As we split the test work between the clients, wee should concatenate their results + f['times'] = [t for fi in fs for t in fi['times']] + f['neighbors'] = [n for fi in fs for n in fi['neighbors']] + f['distances'] = [d for fi in fs for d in fi['distances']] [fi.close() for fi in fs] [os.remove(fi) for fi in group] @@ -155,15 +143,33 @@ def aggregate_outputs(files, clients): print("Changing the workdir to {}".format(workdir)) os.chdir(workdir) - isredis = True if 'redisearch' in args.algorithm else False - ismilvus = True if 'milvus' in args.algorithm else False - ispinecone = True if 'pinecone' in args.algorithm else False + # All supported algorithms that need spacial stuff + isredis = ismilvus = ispinecone = iselastic = False + + if 'redisearch' in args.algorithm: + from redis import Redis + from redis.cluster import RedisCluster + isredis = True + + elif 'milvus' in args.algorithm: + from pymilvus import utility, connections + ismilvus = True + + elif 'pinecone' in args.algorithm: + import pinecone + ispinecone = True + + elif 'elasticsearch' in args.algorithm: + from elasticsearch import Elasticsearch + from elastic_transport.client_utils import DEFAULT + iselastic = True if args.host is None: args.host = 'localhost' if args.port is None: if isredis: args.port = '6379' elif ismilvus: args.port = '19530' + elif iselastic: args.port = '9200' if isredis: redis = RedisCluster if args.cluster else Redis @@ -172,6 +178,14 @@ def aggregate_outputs(files, clients): connections.connect(host=args.host, port=args.port) elif ispinecone: pinecone.init(api_key=args.auth) + elif iselastic: + args.user = args.user if args.user is not None else 'elastic' + args.auth = args.auth if args.auth is not None else os.environ.get('ELASTIC_PASSWORD', '') + try: + es = 
+            es = Elasticsearch([f'http://{args.host}:{args.port}'], request_timeout=3600, basic_auth=(args.user, args.auth))
+            es.info()
+        except Exception:
+            es = Elasticsearch([f'https://{args.host}:{args.port}'], request_timeout=3600, basic_auth=(args.user, args.auth), ca_certs=os.environ.get('ELASTIC_CA', DEFAULT))

     if args.run_group is not None:
         run_groups = [args.run_group]
@@ -213,20 +227,23 @@ def on_created_or_modified(event):
     observer.start()

     for run_group in run_groups:
-        if isredis:
-            redis.flushall()
-        elif ismilvus:
-            if utility.has_collection('milvus'):
-                utility.drop_collection('milvus')
-        elif ispinecone:
-            for idx in pinecone.list_indexes():
-                pinecone.delete_index(idx)
-
         results_dict = {}
         curr_base_build = base_build + ' --run-group ' + run_group
         curr_base_test = base_test + ' --run-group ' + run_group

         if int(args.build_clients) > 0:
+            if isredis:
+                redis.flushall()
+            elif ismilvus:
+                if utility.has_collection('milvus'):
+                    utility.drop_collection('milvus')
+            elif ispinecone:
+                for idx in pinecone.list_indexes():
+                    pinecone.delete_index(idx)
+            elif iselastic:
+                for idx in es.indices.stats()['indices']:
+                    es.indices.delete(index=idx)
+
             clients = [Process(target=os.system, args=(curr_base_build + ' --client-id ' + str(i),))
                        for i in range(1, int(args.build_clients) + 1)]
@@ -245,6 +262,8 @@ def on_created_or_modified(event):
                 if not args.cluster:  # TODO: get total size from all the shards
                     index_size = float(redis.ft('ann_benchmark').info()['vector_index_sz_mb']) * 1024
                     f.attrs["index_size"] = index_size
+                elif iselastic:
+                    f.attrs["index_size"] = es.indices.stats(index='ann_benchmark')['indices']['ann_benchmark']['total']['store']['size_in_bytes']
             f.close()

             results_dict["build"] = {"total_clients": args.build_clients, "build_time": total_time, "vector_index_sz_mb": index_size}
@@ -267,6 +286,9 @@ def on_created_or_modified(event):
             observer.join()
             print(
                 f'summarizing {int(args.test_clients)} clients data ({len(test_stats_files)} files into {len(test_stats_files) // int(args.test_clients)})...')
+            # ls = os.listdir(outputsdir)
+            # ls.remove('build_stats')
+            # aggregate_outputs(ls, int(args.test_clients))
             aggregate_outputs(test_stats_files, int(args.test_clients))
             print('done!')
diff --git a/requirements.txt b/requirements.txt
index fc3552900..2885ef7f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,8 @@ pyyaml==5.4
 psutil==5.6.6
 scipy==1.3.3
 scikit-learn==0.22.2
-jinja2==2.10
+jinja2==3.1.2
 pymilvus==2.0.2
 pinecone-client==2.0.11
 redis==4.3.2
+elasticsearch==8.3.1

From 80e8bf1db34f80ea93ed7596cddc991b757ee736 Mon Sep 17 00:00:00 2001
From: GuyAv46
Date: Wed, 8 Mar 2023 11:09:06 +0200
Subject: [PATCH 68/77] added vecsim lib algo

---
 algos.yaml                               | 54 +++++++++++++++++++++---
 ann_benchmarks/algorithms/vecsim-hnsw.py | 43 +++++++++++++++++++
 ann_benchmarks/main.py                   |  2 +-
 ann_benchmarks/runner.py                 |  4 +-
 4 files changed, 94 insertions(+), 9 deletions(-)
 create mode 100644 ann_benchmarks/algorithms/vecsim-hnsw.py

diff --git a/algos.yaml b/algos.yaml
index 942a3a321..838dd3ea7 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -48,9 +48,51 @@ float:
       constructor: RediSearch
       base-args: ["FLAT", "@metric", "@connection"]
       run-groups:
-        BS-2^20:
+        BS-2^10:
          arg-groups:
-           - {"BLOCK_SIZE": 1048576}
+           - {"BLOCK_SIZE": 1024}
+
+    vecsim-hnsw:
+      module: ann_benchmarks.algorithms.vecsim-hnsw
+      constructor: VecSimHnsw
+      base-args: ["@metric"]
+      run-groups:
+        M-4:
+          arg-groups:
+            - {"M": 4, "efConstruction": 500}
+          query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]]
+        M-8:
+          arg-groups:
+            - {"M": 8,
"efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-12: + arg-groups: + - {"M": 12, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-16: + arg-groups: + - {"M": 16, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-24: + arg-groups: + - {"M": 24, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-36: + arg-groups: + - {"M": 36, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-48: + arg-groups: + - {"M": 48, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-64: + arg-groups: + - {"M": 64, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + M-96: + arg-groups: + - {"M": 96, "efConstruction": 500} + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] pinecone: docker-tag: ann-benchmarks-pinecone module: ann_benchmarks.algorithms.pinecone @@ -119,10 +161,10 @@ float: base: args: [[400, 1024, 4096, 8192, 16384], [1, 10, 40, 100, 200]] - hnswlib: + vecsim-hnsw-blocks: docker-tag: ann-benchmarks-hnswlib - module: ann_benchmarks.algorithms.hnswlib - constructor: HnswLib + module: ann_benchmarks.algorithms.vecsim-hnsw + constructor: VecSimHnsw base-args: ["@metric"] run-groups: M-4: @@ -325,7 +367,7 @@ float: M-96: arg-groups: - {"M": 96, "efConstruction": 500} - query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + query-args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] bruteforce: disabled: true diff --git a/ann_benchmarks/algorithms/vecsim-hnsw.py b/ann_benchmarks/algorithms/vecsim-hnsw.py new file mode 100644 index 000000000..d85fe7836 --- /dev/null +++ b/ann_benchmarks/algorithms/vecsim-hnsw.py @@ -0,0 +1,43 @@ +from __future__ import absolute_import +import os +from VecSim import * +import numpy as np +from ann_benchmarks.constants import INDEX_DIR +from ann_benchmarks.algorithms.base import BaseANN + + +class VecSimHnsw(BaseANN): + def __init__(self, metric, method_param): + self.metric = {'angular': VecSimMetric_Cosine, 'euclidean': VecSimMetric_L2}[metric] + self.method_param = method_param + # print(self.method_param,save_index,query_param) + self.ef = None + self.name = 'VecSim-hnsw (%s)' % (self.method_param) + + def fit(self, X): + hnswparams = HNSWParams() + hnswparams.M =self.method_param['M'] + hnswparams.efConstruction = self.method_param['efConstruction'] + hnswparams.initialCapacity = len(X) + hnswparams.dim = len(X[0]) + hnswparams.type = VecSimType_FLOAT32 + hnswparams.metric = self.metric + hnswparams.multi = False + + self.index = HNSWIndex(hnswparams) + + for i, vector in enumerate(X): + self.index.add_vector(vector, i) + + def set_query_arguments(self, ef): + self.ef = ef + self.index.set_ef(ef) + + def query(self, v, n): + return self.index.knn_query(np.expand_dims(v, axis=0), k=n)[0][0] + + def freeIndex(self): + del self.index + + def __str__(self): + return f"{self.name}, efRuntime: {self.ef}" diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py index 37865e553..6e9064644 100644 --- a/ann_benchmarks/main.py +++ b/ann_benchmarks/main.py @@ -186,7 +186,7 @@ def main(): if args.list_algorithms: list_algorithms(args.definitions) sys.exit(0) - + if args.build_only and args.test_only: raise Exception('Nothing to run (build only and test only was specified)') if (args.build_only or args.test_only) and not args.local: diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py index 1ad0a83c5..89e41ac1d 100644 --- 
a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -161,7 +161,7 @@ def run(definition, dataset, count, run_count, batch, build_only, test_only, num
             else:
                 X_test = X_test[offset:]
             print('running %d out of them' % len(X_test))
-    
+
     for pos, query_arguments in enumerate(query_argument_groups, 1):
         print("Running query argument group %d of %d..." %
               (pos, len(query_argument_groups)))
@@ -324,7 +324,7 @@ def _handle_container_return_value(return_value, container, logger):
         error_msg = return_value['Error']
         exit_code = return_value['StatusCode']
         msg = base_msg + 'returned exit code %d with message %s' %(exit_code, error_msg)
-    else: 
+    else:
         exit_code = return_value
         msg = base_msg + 'returned exit code %d' % (exit_code)

From d19900fd272199b93d2f94ff969e3a789982489b Mon Sep 17 00:00:00 2001
From: GuyAv46
Date: Wed, 8 Mar 2023 19:29:21 +0200
Subject: [PATCH 69/77] add dummy docker tag

---
 algos.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/algos.yaml b/algos.yaml
index 838dd3ea7..e68ebd0fc 100644
--- a/algos.yaml
+++ b/algos.yaml
@@ -53,6 +53,7 @@ float:
            - {"BLOCK_SIZE": 1024}

     vecsim-hnsw:
+      docker-tag: ann-benchmarks-vecsim
       module: ann_benchmarks.algorithms.vecsim-hnsw
       constructor: VecSimHnsw
       base-args: ["@metric"]
       run-groups:
         M-4:

From e0929925fc7b7a7f0b53b11523b0e11ab93cacd8 Mon Sep 17 00:00:00 2001
From: GuyAv46
Date: Thu, 9 Mar 2023 14:15:58 +0200
Subject: [PATCH 70/77] removing password from filename

---
 ann_benchmarks/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/results.py b/ann_benchmarks/results.py
index 7f679a0af..d87352ac5 100644
--- a/ann_benchmarks/results.py
+++ b/ann_benchmarks/results.py
@@ -19,7 +19,7 @@ def get_result_filename(dataset=None, count=None, definition=None,
         data = definition.arguments + query_arguments
         for i in range(len(data)):
             if isinstance(data[i], dict):
-                data[i] = {k:data[i][k] for k in data[i] if data[i][k] is not None}
+                data[i] = {k:data[i][k] for k in data[i] if data[i][k] is not None and k != 'auth'}
         data.append('client')
         data.append(id)
         d.append(re.sub(r'\W+', '_', json.dumps(data, sort_keys=True)).strip('_') + ".hdf5")

From 2f3000fcde35e81cd01dd2df5b6393da03e44346 Mon Sep 17 00:00:00 2001
From: GuyAv46
Date: Thu, 9 Mar 2023 17:00:16 +0200
Subject: [PATCH 71/77] skipping using multiprocessing when `parallelism == 1`

---
 ann_benchmarks/main.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/ann_benchmarks/main.py b/ann_benchmarks/main.py
index 6e9064644..ab6bd6cbc 100644
--- a/ann_benchmarks/main.py
+++ b/ann_benchmarks/main.py
@@ -11,6 +11,7 @@
 import shutil
 import sys
 import traceback
+import time

 from ann_benchmarks.datasets import get_dataset, DATASETS
 from ann_benchmarks.constants import INDEX_DIR
@@ -307,11 +308,18 @@ def _test(df):
     queue = multiprocessing.Queue()
     for definition in definitions:
         queue.put(definition)
-    if args.batch and args.parallelism > 1:
-        raise Exception(f"Batch mode uses all available CPU resources, --parallelism should be set to 1. (Was: {args.parallelism})")
-    workers = [multiprocessing.Process(target=run_worker, args=(i+1, args, queue))
-               for i in range(args.parallelism)]
-    [worker.start() for worker in workers]
-    [worker.join() for worker in workers]
+
+    if args.parallelism == 1:
+        # Wait for some jobs to be inserted into the queue
+        while queue.empty(): time.sleep(0.01)
+        # If we're only running one worker, then we can just run it in the same process
+        run_worker(1, args, queue)
+    else:
+        if args.batch:
+            raise Exception(f"Batch mode uses all available CPU resources, --parallelism should be set to 1. (Was: {args.parallelism})")
+        workers = [multiprocessing.Process(target=run_worker, args=(i+1, args, queue))
+                   for i in range(args.parallelism)]
+        [worker.start() for worker in workers]
+        [worker.join() for worker in workers]

     # TODO: need to figure out cleanup handling here

From f9b542297078f1bbf7df86e5c447f5feaed52d3f Mon Sep 17 00:00:00 2001
From: GuyAv46
Date: Tue, 28 Mar 2023 18:27:18 +0300
Subject: [PATCH 72/77] added throughput metric (collect start and end time)

---
 ann_benchmarks/plotting/metrics.py       | 12 ++++++++++++
 ann_benchmarks/plotting/plot_variants.py |  1 +
 ann_benchmarks/runner.py                 |  5 +++++
 multirun.py                              |  2 ++
 4 files changed, 20 insertions(+)

diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py
index 0702bd317..f572a43a6 100644
--- a/ann_benchmarks/plotting/metrics.py
+++ b/ann_benchmarks/plotting/metrics.py
@@ -76,6 +76,13 @@ def rel(dataset_distances, run_distances, metrics):
     return metrics.attrs['rel']


+def throughput(queries, attrs):
+    try:
+        return (attrs['run_count'] * len(queries)) / (attrs["end_querying_time"] - attrs["start_querying_time"])
+    except KeyError:
+        return 0
+
+# actually qps per thread/connection
 def queries_per_second(queries, attrs):
     return 1.0 / attrs["best_search_time"]

@@ -130,6 +137,11 @@ def dist_computations(queries, attrs):
         "function": lambda true_distances, run_distances, metrics, times, run_attrs: rel(true_distances, run_distances, metrics),  # noqa
         "worst": float("inf")
     },
+    "throughput": {
+        "description": "Index (Server) Throughput (qps over all threads/connections)",
+        "function": lambda true_distances, run_distances, metrics, times, run_attrs: throughput(true_distances, run_attrs),  # noqa
+        "worst": float("-inf")
+    },
     "qps": {
         "description": "Queries per second (1/s)",
         "function": lambda true_distances, run_distances, metrics, times, run_attrs: queries_per_second(true_distances, run_attrs),  # noqa
diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py
index a30d06dfd..fdae0192a 100644
--- a/ann_benchmarks/plotting/plot_variants.py
+++ b/ann_benchmarks/plotting/plot_variants.py
@@ -2,6 +2,7 @@
 all_plot_variants = {
     "recall/time": ("k-nn", "qps"),
+    "recall/absolute_time": ("k-nn", "throughput"),
     "recall/buildtime": ("k-nn", "build"),
     "recall/indexsize": ("k-nn", "indexsize"),
     "recall/distcomps": ("k-nn", "distcomps"),
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index 89e41ac1d..7c96c6494 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -27,6 +27,8 @@ def run_individual_query(algo, X_train, X_test, distance, count, run_count,
         ((not batch) and hasattr(algo, "prepare_query"))

     best_search_time = float('inf')
+    start_time = time.time() # actual start time
+    end_time = start_time # "virtual" end time. actual end time is start_time + sum of query times
     for i in range(run_count):
         print('Run %d/%d...' % (i + 1, run_count))
        # a bit dumb but can't be a scalar since of Python's scoping rules
@@ -75,6 +77,7 @@ def batch_query(X):
         results = [single_query(x) for x in X_test]

     total_time = sum(time for time, _ in results)
+    end_time += total_time
     total_candidates = sum(len(candidates) for _, candidates in results)
     search_time = total_time / len(X_test)
     avg_candidates = total_candidates / len(X_test)
@@ -85,6 +88,8 @@
     attrs = {
         "batch_mode": batch,
         "best_search_time": best_search_time,
+        "start_querying_time": start_time,
+        "end_querying_time": end_time,
         "candidates": avg_candidates,
         "expect_extra": verbose,
         "name": str(algo),
diff --git a/multirun.py b/multirun.py
index b4fb58ab4..75cc7f871 100644
--- a/multirun.py
+++ b/multirun.py
@@ -40,6 +40,8 @@ def aggregate_outputs(files, clients):
             f.attrs[k] = v
         f.attrs["best_search_time"] = average([fi.attrs["best_search_time"] for fi in fs])
         f.attrs["candidates"] = average([fi.attrs["candidates"] for fi in fs])
+        f.attrs["start_querying_time"] = min([fi.attrs["start_querying_time"] for fi in fs])
+        f.attrs["end_querying_time"] = max([fi.attrs["end_querying_time"] for fi in fs])

         # As we split the test work between the clients, wee should concatenate their results
         f['times'] = [t for fi in fs for t in fi['times']]

From 63d08d084eb400839b069bfd7b49c44b44e6f3f4 Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Fri, 2 Jun 2023 14:19:55 +0100
Subject: [PATCH 73/77] Updated throughput tracking

---
 ann_benchmarks/plotting/metrics.py       | 13 ++-----------
 ann_benchmarks/plotting/plot_variants.py |  1 -
 ann_benchmarks/runner.py                 |  3 +--
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/ann_benchmarks/plotting/metrics.py b/ann_benchmarks/plotting/metrics.py
index f572a43a6..71e650b0b 100644
--- a/ann_benchmarks/plotting/metrics.py
+++ b/ann_benchmarks/plotting/metrics.py
@@ -76,15 +76,11 @@ def rel(dataset_distances, run_distances, metrics):
     return metrics.attrs['rel']


-def throughput(queries, attrs):
+def queries_per_second(queries, attrs):
     try:
         return (attrs['run_count'] * len(queries)) / (attrs["end_querying_time"] - attrs["start_querying_time"])
     except KeyError:
-        return 0
-
-# actually qps per thread/connection
-def queries_per_second(queries, attrs):
-    return 1.0 / attrs["best_search_time"]
+        return 1.0 / attrs["best_search_time"]

 def percentile_50(times):
     return np.percentile(times, 50.0) * 1000.0
@@ -137,11 +133,6 @@ def dist_computations(queries, attrs):
         "function": lambda true_distances, run_distances, metrics, times, run_attrs: rel(true_distances, run_distances, metrics),  # noqa
         "worst": float("inf")
     },
-    "throughput": {
-        "description": "Index (Server) Throughput (qps over all threads/connections)",
-        "function": lambda true_distances, run_distances, metrics, times, run_attrs: throughput(true_distances, run_attrs),  # noqa
-        "worst": float("-inf")
-    },
     "qps": {
         "description": "Queries per second (1/s)",
         "function": lambda true_distances, run_distances, metrics, times, run_attrs: queries_per_second(true_distances, run_attrs),  # noqa
diff --git a/ann_benchmarks/plotting/plot_variants.py b/ann_benchmarks/plotting/plot_variants.py
index fdae0192a..a30d06dfd 100644
--- a/ann_benchmarks/plotting/plot_variants.py
+++ b/ann_benchmarks/plotting/plot_variants.py
@@ -2,7 +2,6 @@
 all_plot_variants = {
     "recall/time": ("k-nn", "qps"),
-    "recall/absolute_time": ("k-nn", "throughput"),
     "recall/buildtime": ("k-nn", "build"),
     "recall/indexsize": ("k-nn", "indexsize"),
     "recall/distcomps": ("k-nn", "distcomps"),
diff --git a/ann_benchmarks/runner.py b/ann_benchmarks/runner.py
index 7c96c6494..0e6bb53d0 100644
--- a/ann_benchmarks/runner.py
+++ b/ann_benchmarks/runner.py
@@ -75,9 +75,8 @@ def batch_query(X):
         results = batch_query(X_test)
     else:
         results = [single_query(x) for x in X_test]
-
+    end_time = time.time()
     total_time = sum(time for time, _ in results)
-    end_time += total_time
     total_candidates = sum(len(candidates) for _, candidates in results)
     search_time = total_time / len(X_test)
     avg_candidates = total_candidates / len(X_test)

From 9d31f9f049dafebd579dcc45dead3d068a5d8565 Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Mon, 12 Jun 2023 00:13:26 +0100
Subject: [PATCH 74/77] Fixed algorithm str when using FLAT on redisearch

---
 ann_benchmarks/algorithms/redisearch.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py
index f37030139..137ac2fb6 100644
--- a/ann_benchmarks/algorithms/redisearch.py
+++ b/ann_benchmarks/algorithms/redisearch.py
@@ -15,6 +15,7 @@ def __init__(self, algo, metric, conn_params, method_param):
         self.name = 'redisearch-%s (%s)' % (self.algo, self.method_param)
         self.index_name = "ann_benchmark"
         self.text = None
+        self.ef = None

         redis = RedisCluster if conn_params['cluster'] else Redis
         host = conn_params["host"] if conn_params["host"] else 'localhost'
@@ -87,4 +88,7 @@ def freeIndex(self):
         self.redis.execute_command("FLUSHALL")

     def __str__(self):
-        return self.name + f", efRuntime: {self.ef}"
+        res = self.name
+        if self.ef is not None:
+            res += + f", efRuntime: {self.ef}"
+        return res

From fb02421d4266bfaf37a80d4d346379b23a2e2661 Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Fri, 16 Jun 2023 15:12:18 +0100
Subject: [PATCH 75/77] Ensure all primaries receive the FT.CREATE due to 'missing index error on querying'

---
 ann_benchmarks/algorithms/redisearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py
index f37030139..eb90216c1 100644
--- a/ann_benchmarks/algorithms/redisearch.py
+++ b/ann_benchmarks/algorithms/redisearch.py
@@ -37,7 +37,7 @@ def fit(self, X, offset=0, limit=None, hybrid_buckets = None):
             elif self.algo == "FLAT":
                 args.extend(['vector', 'VECTOR', self.algo, '6', 'TYPE', 'FLOAT32', 'DIM', len(X[0]), 'DISTANCE_METRIC', self.metric])
             print("Calling FT.CREATE", *args)
-            self.redis.execute_command('FT.CREATE', *args, target_nodes='random')
+            self.redis.execute_command('FT.CREATE', *args, target_nodes='primaries')
         except Exception as e:
             if 'Index already exists' not in str(e):
                 raise

From 166d1f49afe10d0d88770171af37503eb6089c93 Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Sat, 17 Jun 2023 18:54:32 +0100
Subject: [PATCH 76/77] Fix unary typo on redisearch __str__

---
 ann_benchmarks/algorithms/redisearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py
index 0be3e0d73..29ba5af3f 100644
--- a/ann_benchmarks/algorithms/redisearch.py
+++ b/ann_benchmarks/algorithms/redisearch.py
@@ -90,5 +90,5 @@ def freeIndex(self):
     def __str__(self):
         res = self.name
         if self.ef is not None:
-            res += + f", efRuntime: {self.ef}"
+            res += f", efRuntime: {self.ef}"
         return res

From cc36963349af64f86d32886da2cd15a16ea2c95d Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Sun, 18 Jun 2023 20:35:59 +0100
Subject: [PATCH 77/77] disable query timeout on redisearch

---
 ann_benchmarks/algorithms/redisearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ann_benchmarks/algorithms/redisearch.py b/ann_benchmarks/algorithms/redisearch.py
index 29ba5af3f..c1f3ea201 100644
--- a/ann_benchmarks/algorithms/redisearch.py
+++ b/ann_benchmarks/algorithms/redisearch.py
@@ -81,7 +81,7 @@ def query(self, v, k):
             vq = f'(@t:{self.text})=>[KNN {k} @vector $BLOB {qparams}]'
         else:
             vq = f'*=>[KNN {k} @vector $BLOB {qparams}]'
-        q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes(), 'DIALECT', '2']
+        q = ['FT.SEARCH', self.index_name, vq, 'NOCONTENT', 'SORTBY', '__vector_score', 'LIMIT', '0', str(k), 'PARAMS', '2', 'BLOB', v.tobytes(), 'DIALECT', '2', 'TIMEOUT', '0']
         return [int(doc) for doc in self.redis.execute_command(*q, target_nodes='random')[1:]]

     def freeIndex(self):
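
Note on the throughput metric introduced in PATCH 72 and folded into qps in PATCH 73: the reported value is derived from wall-clock timestamps rather than from the best single-query latency, with a fallback for result files that predate the new attributes. The following is a minimal standalone sketch of that computation, not code from the repository; the attribute names follow the patches, while the function signature and sample values are invented for illustration.

    # Sketch of the PATCH 73 qps metric: prefer wall-clock throughput over the
    # querying window, fall back to 1 / best_search_time when the run lacks the
    # new start/end attributes. Attribute names follow the patches above.
    def queries_per_second(num_queries, attrs):
        try:
            total_queries = attrs["run_count"] * num_queries
            return total_queries / (attrs["end_querying_time"] - attrs["start_querying_time"])
        except KeyError:
            return 1.0 / attrs["best_search_time"]

    # Example: 2 runs over 10,000 queries that spanned 25 seconds of wall clock.
    attrs = {
        "run_count": 2,
        "start_querying_time": 1000.0,
        "end_querying_time": 1025.0,
        "best_search_time": 0.0012,
    }
    print(queries_per_second(10000, attrs))  # 800.0 queries per second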
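The multi-client aggregation in multirun.py (PATCH 67 and PATCH 72) follows the same idea: because the query workload is split between clients, per-query arrays are concatenated, latency-style attributes are averaged, and the querying window spans the earliest start and latest end across clients. A simplified sketch using plain dicts in place of the HDF5 files; the field names follow the patches, everything else is illustrative.

    def aggregate_clients(client_results):
        """Merge per-client runs the way multirun.py does, with plain dicts."""
        n = len(client_results)
        return {
            # work is split between clients, so per-query results are concatenated
            "times": [t for c in client_results for t in c["times"]],
            "neighbors": [x for c in client_results for x in c["neighbors"]],
            "distances": [d for c in client_results for d in c["distances"]],
            # latency-style attributes are averaged across clients
            "best_search_time": sum(c["best_search_time"] for c in client_results) / n,
            "candidates": sum(c["candidates"] for c in client_results) / n,
            # wall-clock window spans the earliest start and the latest end
            "start_querying_time": min(c["start_querying_time"] for c in client_results),
            "end_querying_time": max(c["end_querying_time"] for c in client_results),
        }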
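PATCH 70 keeps credentials out of result file names by dropping unset entries and the 'auth' key from any dict-valued argument before it is serialized into the name. The same idea in isolation, as a hedged sketch; the helper function and the sample arguments below are illustrative and not part of the repository.

    import json
    import re

    def result_file_name(arguments, client_id):
        data = list(arguments)
        for i, item in enumerate(data):
            if isinstance(item, dict):
                # drop unset connection fields and never leak the password
                data[i] = {k: v for k, v in item.items() if v is not None and k != "auth"}
        data += ["client", client_id]
        return re.sub(r"\W+", "_", json.dumps(data, sort_keys=True)).strip("_") + ".hdf5"

    print(result_file_name(["HNSW", {"host": "localhost", "port": None, "auth": "s3cret"}], 1))
    # HNSW_host_localhost_client_1.hdf5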