Skip to content

Commit

Permalink
feat: init repo with pgvecto.rs support (#1)
Browse files Browse the repository at this point in the history
* feat: init with pgvecto.rs

Signed-off-by: Keming <[email protected]>

* ignore py

Signed-off-by: Keming <[email protected]>

* re-arrange the structure, add pseudo reader

Signed-off-by: Keming <[email protected]>

* use vector type

Signed-off-by: Keming <[email protected]>

* re struct

Signed-off-by: Keming <[email protected]>

* basic func

Signed-off-by: Keming <[email protected]>

* fix cli

Signed-off-by: Keming <[email protected]>

* add tqdm

Signed-off-by: Keming <[email protected]>

* feat: add dataset downloader (#1)

* feat: add dataset downloader

Signed-off-by: Keming <[email protected]>

* fix insert memory usage

Signed-off-by: Keming <[email protected]>

---------

Signed-off-by: Keming <[email protected]>

* add links in readme

Signed-off-by: Keming <[email protected]>

---------

Signed-off-by: Keming <[email protected]>
  • Loading branch information
kemingy authored Jan 23, 2024
1 parent e527067 commit 2cb974a
Show file tree
Hide file tree
Showing 18 changed files with 926 additions and 0 deletions.
166 changes: 166 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.ruff_cache/
*.csv
*.json
*.hdf5
datasets/
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
PY_SOURCE=vector_bench

dev:
	@pip install -e .[dev]

lint:
	@ruff check ${PY_SOURCE}

format:
	@ruff check --fix ${PY_SOURCE}
	@ruff format ${PY_SOURCE}

clean:
	@-rm -rf dist build __pycache__ *.egg-info ${PY_SOURCE}/__version__.py

build:
	@python -m build

test:
	@pytest -v tests
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1 +1,32 @@
# Vector DB Benchmark

Supported databases/extensions:

- [x] [`pgvecto.rs`](https://github.com/tensorchord/pgvecto.rs)
- [ ] [`pgvector`](https://github.com/pgvector/pgvector)
- [ ] [`qdrant`](https://github.com/qdrant/qdrant/)

Supported datasets:

- [x] randomly generated
- [x] GIST 960


## Installation

```bash
pip install vector_bench[pgvectors]
```

## Run

```bash
# help
vector_bench --help
# only insert the data
vector_bench --insert --url postgresql://postgres:password@localhost:5432/postgres -s gist_960_l2
# only query the data (make sure the data is already inserted)
vector_bench --query --url postgresql://postgres:password@localhost:5432/postgres -s gist_960_l2
# insert and query the data
vector_bench --insert --query --url postgresql://postgres:password@localhost:5432/postgres -s gist_960_l2
```
41 changes: 41 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
[project]
name = "vectordb-benchmark"
dynamic = ["version"]
requires-python = ">=3.8"
readme = "README.md"
license = {text = "Apache-2.0"}
dependencies = [
"msgspec~=0.18.5",
"h5py~=3.10.0",
"numpy~=1.26.3",
"tqdm~=4.66",
"httpx",
]
[project.optional-dependencies]
dev = [
"ruff~=0.1.3",
"pytest~=7.4",
]
pgvectors = [
"psycopg[binary]",
]
[project.urls]
"Homepage" = "https://github.com/tensorchord/vectordb-benchmark"
[build-system]
requires = ["setuptools", "setuptools_scm>=7.0"]
build-backend = "setuptools.build_meta"
[project.scripts]
"vector_bench" = "vector_bench.main:main"

[tool.setuptools_scm]
fallback_version = "0.0.0"

[tool.ruff]
target-version = "py38"
[tool.ruff.lint]
select = ["E", "F", "G", "B", "I", "SIM", "TID", "PL", "RUF"]
ignore = ["E501"]
[tool.ruff.lint.isort]
known-first-party = ["vector_bench"]
[tool.ruff.pylint]
max-args = 7
33 changes: 33 additions & 0 deletions vector_bench/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from argparse import ArgumentParser

from vector_bench.dataset.source import DataSource


def build_arg_parser():
    """Create the command-line argument parser for the benchmark CLI."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--client",
        "-c",
        choices=["pgvecto_rs", "pgvector", "qdrant"],
        default="pgvecto_rs",
        help="client type",
    )
    arg_parser.add_argument(
        "--worker-num",
        "-w",
        type=int,
        help="number of workers, if not set, use min(32, cpu_thread + 4)",
    )
    arg_parser.add_argument("--url", "-u", help="database url")
    arg_parser.add_argument(
        "--source", "-s", choices=DataSource.list(), help="dataset source"
    )
    arg_parser.add_argument("--query", action="store_true", help="query benchmark")
    arg_parser.add_argument("--insert", action="store_true", help="insert data")
    return arg_parser


if __name__ == "__main__":
    # Manual smoke check: print whatever the CLI parser extracts.
    print(build_arg_parser().parse_args())
93 changes: 93 additions & 0 deletions vector_bench/bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from concurrent.futures import CancelledError, ThreadPoolExecutor, as_completed
from itertools import islice
from threading import Lock
from time import perf_counter
from typing import Iterable, Optional

from vector_bench.client import DataBaseClient
from vector_bench.client.base import BaseClient
from vector_bench.dataset import DatasetReader
from vector_bench.dataset.base import BaseReader
from vector_bench.log import logger
from vector_bench.spec import BenchmarkResult, DatabaseConfig, DatasetConfig, Query


def batched(iterable: Iterable, n: int):
    """Yield successive tuples of at most *n* items from *iterable*.

    The final batch may be shorter than *n*. Backport of
    ``itertools.batched`` (Python 3.12+) for this project's 3.8+ support.

    Raises:
        ValueError: if *n* is less than 1 — with ``n == 0`` the original
            loop condition would silently discard the entire input.
    """
    if n < 1:
        raise ValueError("batch size n must be at least 1")
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


class Benchmark:
    """Run insert and query benchmarks against a vector database.

    The database client and dataset reader are resolved from their
    registries (``DataBaseClient`` / ``DatasetReader``) based on the
    provided configs.
    """

    def __init__(
        self,
        db_config: DatabaseConfig,
        dataset_config: DatasetConfig,
        worker_num: Optional[int] = None,
    ) -> None:
        self.client: BaseClient = DataBaseClient.select(db_config.name).from_config(
            db_config
        )
        self.reader: BaseReader = DatasetReader.select(
            dataset_config.type.value
        ).from_config(dataset_config)
        # None lets ThreadPoolExecutor pick its default worker count.
        self.worker_num: Optional[int] = worker_num
        self.query_result: BenchmarkResult = BenchmarkResult()
        # `_query_helper` runs concurrently in executor threads and mutates
        # `query_result`; `query += 1` is a non-atomic read-modify-write, so
        # guard all result updates with a lock to avoid lost updates.
        self._result_lock = Lock()

    def insert(self):
        """Insert all records from the reader, in concurrent batches."""
        logger.info("inserting records...")
        epoch_size, batch_size = 10000, 20
        with ThreadPoolExecutor(self.worker_num) as executor:
            # `_max_workers` is private but ThreadPoolExecutor exposes no
            # public accessor for the resolved worker count.
            logger.info("using %s executors", executor._max_workers)
            for i, epoch in enumerate(batched(self.reader.read_record(), epoch_size)):
                epoch_start = perf_counter()
                for future in as_completed(
                    executor.submit(self.client.insert_batch, records)
                    for records in batched(epoch, batch_size)
                ):
                    try:
                        future.result()
                    except (CancelledError, TimeoutError) as err:
                        logger.exception("failed to insert records", exc_info=err)
                logger.info(
                    "finished %s records with RPS(%.3f)",
                    (i + 1) * epoch_size,
                    epoch_size / (perf_counter() - epoch_start),
                )

    def _query_helper(self, query: Query):
        """Execute one query and record its latency and precision.

        Thread-safe: called concurrently from `query()` worker threads.
        Precision is the fraction of `query.expect_ids` found in the
        returned records.
        """
        start_time = perf_counter()
        records = self.client.query(query.vector, len(query.expect_ids))
        elapsed = perf_counter() - start_time
        precision = len(
            set(query.expect_ids).intersection(record.id for record in records)
        ) / len(query.expect_ids)
        with self._result_lock:
            self.query_result.query += 1
            self.query_result.latency.append(elapsed)
            self.query_result.precision.append(precision)

    def query(self) -> BenchmarkResult:
        """Run all queries from the reader concurrently.

        Returns:
            The accumulated ``BenchmarkResult`` (latency, precision,
            total wall time, worker count).
        """
        logger.info("querying...")
        epoch_size = 100
        with ThreadPoolExecutor(self.worker_num) as executor:
            logger.info("using %s executors", executor._max_workers)
            start = perf_counter()
            for i, epoch in enumerate(batched(self.reader.read_query(), epoch_size)):
                epoch_start = perf_counter()
                for future in as_completed(
                    executor.submit(self._query_helper, query) for query in epoch
                ):
                    try:
                        future.result()
                    except (CancelledError, TimeoutError) as err:
                        logger.exception("failed to query", exc_info=err)
                logger.info(
                    "finished %s queries with RPS (%.3f)",
                    (i + 1) * epoch_size,
                    epoch_size / (perf_counter() - epoch_start),
                )
            self.query_result.total_second = perf_counter() - start
            self.query_result.worker_num = executor._max_workers

        return self.query_result
6 changes: 6 additions & 0 deletions vector_bench/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from vector_bench.client.pgvecto_rs import PgVectorsClient
from vector_bench.spec import EnumSelector


class DataBaseClient(EnumSelector):
    """Registry of supported database client implementations.

    Members map a client name to its client class; a member is resolved
    at runtime via ``DataBaseClient.select(name)``.
    """

    PGVECTO_RS = PgVectorsClient
Loading

0 comments on commit 2cb974a

Please sign in to comment.