Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Add Couchbase Reader (#892)
Browse files Browse the repository at this point in the history
* Add the initial version of Couchbase documents reader

* Update the Readme

* Add doc strings

* Update the examples link

---------

Co-authored-by: Nithish Raghunandanan <[email protected]>
  • Loading branch information
nithishr and nithishr authored Jan 27, 2024
1 parent 561f37d commit fb4d2fb
Show file tree
Hide file tree
Showing 5 changed files with 180 additions and 1 deletion.
56 changes: 56 additions & 0 deletions llama_hub/couchbase/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Couchbase Loader

This loader loads documents from a Couchbase cluster.
The user initializes the reader with either an existing Couchbase client or a connection string plus database
credentials, and then supplies a SQL++ query to fetch the relevant documents.

## Usage

Here's an example usage of the CouchbaseReader.

```python
from llama_index import download_loader
import os

CouchbaseLoader = download_loader('CouchbaseReader')

connection_string = "couchbase://localhost" # valid Couchbase connection string
db_username = "<valid_database_user_with_read_access_to_bucket_with_data>"
db_password = "<password_for_database_user>"

# query is a valid SQL++ query that is passed to client.query()
query = """
SELECT h.* FROM `travel-sample`.inventory.hotel h
WHERE h.country = 'United States'
LIMIT 5
"""

reader = CouchbaseLoader(
connection_string=connection_string,
db_username=db_username,
db_password=db_password
)

# It is also possible to pass an initialized Couchbase client to the document loader
# from couchbase.auth import PasswordAuthenticator # noqa: E402
# from couchbase.cluster import Cluster # noqa: E402
# from couchbase.options import ClusterOptions # noqa: E402

# auth = PasswordAuthenticator(
# db_username,
# db_password,
# )

# couchbase_client = Cluster(connection_string, ClusterOptions(auth))
# reader = CouchbaseLoader(client=couchbase_client)

# fields to be written to the document
text_fields=["name", "title", "address", "reviews"]

# metadata fields to be written to the document's metadata
metadata_fields=["country", "city"]

documents = reader.load_data(query=query, text_fields=text_fields, metadata_fields=metadata_fields)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/tree/main/llama_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/run-llama/llama-hub/tree/main) for examples.
6 changes: 6 additions & 0 deletions llama_hub/couchbase/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Package init for the Couchbase loader; re-exports ``CouchbaseReader``."""
from llama_hub.couchbase.base import (
CouchbaseReader,
)

# Names exported by ``from llama_hub.couchbase import *``.
__all__ = ["CouchbaseReader"]
107 changes: 107 additions & 0 deletions llama_hub/couchbase/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Couchbase document loader"""

from typing import Any, Iterable, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class CouchbaseReader(BaseReader):
    """Couchbase document loader.

    Loads data from a Couchbase cluster into Document used by LlamaIndex.

    Args:
        client (Optional[Any]): A Couchbase client to use.
            If not provided, the client will be created based on the
            connection_string and database credentials.
        connection_string (Optional[str]): The connection string to the
            Couchbase cluster.
        db_username (Optional[str]): The username to connect to the
            Couchbase cluster.
        db_password (Optional[str]): The password to connect to the
            Couchbase cluster.

    Raises:
        ImportError: If the `couchbase` package is not installed.
        ValueError: If no client is given and any of connection_string,
            db_username or db_password is missing.
    """

    def __init__(
        self,
        client: Optional[Any] = None,
        connection_string: Optional[str] = None,
        db_username: Optional[str] = None,
        db_password: Optional[str] = None,
    ) -> None:
        """Initialize Couchbase document loader."""
        import_err_msg = "`couchbase` package not found, please run `pip install --upgrade couchbase`"
        # Imported lazily so the module can be imported without the SDK;
        # the check runs even when a ready-made client is supplied.
        try:
            from couchbase.auth import PasswordAuthenticator
            from couchbase.cluster import Cluster
            from couchbase.options import ClusterOptions
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(import_err_msg) from err

        if client:
            # A caller-supplied client takes precedence over credentials.
            self._client = client
            return

        if not connection_string or not db_username or not db_password:
            raise ValueError(
                "You need to pass either a couchbase client or connection_string and credentials must be provided."
            )

        auth = PasswordAuthenticator(
            db_username,
            db_password,
        )
        self._client: Cluster = Cluster(connection_string, ClusterOptions(auth))

    def lazy_load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Load data from the Couchbase cluster lazily.

        This is a generator: argument validation and the query itself only
        run once iteration starts.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns of
                each row are written.
            metadata_fields (Optional[List[str]]): The columns to write into
                the `metadata` field of the document. By default, no columns
                are written.

        Yields:
            Document: One document per row returned by the query.

        Raises:
            ValueError: If `query` is empty.
        """
        from datetime import timedelta

        if not query:
            raise ValueError("Query must be provided.")

        # Treat None (the default, and what load_data forwards) as "no
        # metadata" instead of failing when iterating over None.
        selected_metadata = metadata_fields or []

        # Ensure connection to Couchbase cluster
        self._client.wait_until_ready(timedelta(seconds=5))

        # Run SQL++ Query
        result = self._client.query(query)
        for row in result:
            # NOTE(review): rows are presumably dicts; a metadata field that
            # is absent from a row raises from `row[field]` — confirm whether
            # that is the desired contract for schemaless documents.
            metadata = {field: row[field] for field in selected_metadata}

            if text_fields:
                pairs = [(k, v) for k, v in row.items() if k in text_fields]
            else:
                # Default: every column of *this* row. Decided per row so a
                # row with different keys than the first one is not truncated.
                pairs = list(row.items())

            document = "\n".join(f"{k}: {v}" for k, v in pairs)

            yield Document(text=document, metadata=metadata)

    def load_data(
        self,
        query: str,
        text_fields: Optional[List[str]] = None,
        metadata_fields: Optional[List[str]] = None,
    ) -> List[Document]:
        """Load data from the Couchbase cluster.

        Eagerly collects everything produced by `lazy_load_data` into a list.

        Args:
            query (str): The SQL++ query to execute.
            text_fields (Optional[List[str]]): The columns to write into the
                `text` field of the document. By default, all columns are
                written.
            metadata_fields (Optional[List[str]]): The columns to write into
                the `metadata` field of the document. By default, no columns
                are written.

        Returns:
            List[Document]: One document per row returned by the query.

        Raises:
            ValueError: If `query` is empty.
        """
        return list(self.lazy_load_data(query, text_fields, metadata_fields))
1 change: 1 addition & 0 deletions llama_hub/couchbase/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
couchbase
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1223,5 +1223,14 @@
"XMLReader": {
"id": "file/xml",
"author": "mmaatouk"
},
"CouchbaseReader": {
"id": "couchbase",
"author": "nithishr",
"keywords": [
"Couchbase",
"Capella",
"NoSQL"
]
}
}
}

0 comments on commit fb4d2fb

Please sign in to comment.