Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support edit metadata #414

Merged
merged 7 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 37 additions & 20 deletions lazyllm/tools/rag/doc_manager.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import os
import json
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Union
from pydantic import BaseModel, Field
from typing import Dict, List, Any

from starlette.responses import RedirectResponse
from fastapi import UploadFile, Body
Expand Down Expand Up @@ -215,8 +214,8 @@ def delete_files_from_group(self, request: FileGroupRequest):
return BaseResponse(code=500, msg=str(e), data=None)

class AddMetadataRequest(BaseModel):
doc_ids: list
kv_pair: dict
doc_ids: List[str]
kv_pair: Dict[str, Union[bool, int, float, str, list]]

@app.post("/add_metadata")
def add_metadata(self, add_metadata_request: AddMetadataRequest):
Expand All @@ -230,47 +229,65 @@ def add_metadata(self, add_metadata_request: AddMetadataRequest):
for doc in docs:
meta_dict = json.loads(doc.meta) if doc.meta else {}
for k, v in kv_pair.items():
if k not in meta_dict:
if k not in meta_dict or not meta_dict[k]:
meta_dict[k] = v
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

此处不合理,假设我有两个标签a和b先后add,第一次add的时候会赋值成a,然后第二次add时就报错了

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我建议metadata加的时候要简单指定一下是不是list(取一个适合业务的名字,比如vector_values=True)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

出于简化设计和易于使用的考量,改为了这样的设计:
前提约束:metadata 内容是扁平的,即 {key: val} 的 val 中不会出现嵌套的 dict。

增加metadata时, kv_pair类型约束为: Dict[str, Union[bool, int, float, str, list]]

  1. 若k为新增:使用v直接赋值
  2. k已存在,若meta[k] 为list:将v中元素追加至meta[k]中
  3. k已存在,meta[k] 非list:将meta[k]变为list,并追加v中所有元素

elif isinstance(meta_dict[k], list):
meta_dict[k].append(v)
meta_dict[k].extend(v) if isinstance(v, list) else meta_dict[k].append(v)
else:
return BaseResponse(code=400, msg=f"Failed, {k} exists but value is not a list")
meta_dict[k] = [meta_dict[k]] + v if isinstance(v, list) else [meta_dict[k], v]
doc_meta[doc.doc_id] = meta_dict
self._manager.set_docs_new_meta(doc_meta)
return BaseResponse(data=None)
except Exception as e:
return BaseResponse(code=500, msg=str(e), data=None)

class DeleteMetadataRequest(BaseModel):
doc_ids: list
keys: Optional[List[str]] = Field(None)
doc_ids: List[str]
# value type should be None, a scalar (bool/int/float/str), or a list
kv_pair: Optional[Dict[str, Union[None, bool, int, float, str, list]]] = Field(None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

也同时保留原来keys的逻辑吧,并且判断一下keys和kv_pair有且只能有一个

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里稍微做了一下调整:添加了 keys 关键字,并且 keys 和 kv_pair 可以都存在、都不存在,或只存在其中一个。
功能效果如下:

  1. 都不存在,把所有用户设置的metadata key删除
  2. 只存在 keys,以key删除
  3. 只存在kv_pair, 以key+value 删除
  4. keys与kv_pair都存在。合并删除,即满足条件的key和kv_pair都会删除


def _inplace_del_meta(self, meta_dict, kv_pair: Dict[str, Union[None, bool, int, float, str, list]]):
# alert: meta_dict is not a deepcopy
for k, v in kv_pair.items():
if k not in meta_dict:
continue
if v is None:
meta_dict.pop(k, None)
elif isinstance(meta_dict[k], list):
if isinstance(v, (bool, int, float, str)):
v = [v]
# delete v exists in meta_dict[k]
meta_dict[k] = list(set(meta_dict[k]) - set(v))
else:
# old meta[k] not a list, use v as condition to delete the key
if meta_dict[k] == v:
meta_dict.pop(k, None)

@app.post("/delete_metadata_keys")
def delete_metadata_keys(self, del_metadata_request: DeleteMetadataRequest):
@app.post("/delete_metadata_item")
def delete_metadata_item(self, del_metadata_request: DeleteMetadataRequest):
doc_ids = del_metadata_request.doc_ids
keys = del_metadata_request.keys
kv_pair = del_metadata_request.kv_pair
try:
if not keys:
self._manager.set_docs_new_meta({doc_id:{} for doc_id in doc_ids})
if not kv_pair:
# clear metadata
self._manager.set_docs_new_meta({doc_id: {} for doc_id in doc_ids})
else:
docs = self._manager.get_docs(doc_ids)
if not docs:
return BaseResponse(code=400, msg="Failed, no doc found")
doc_meta = {}
for doc in docs:
meta_dict = json.loads(doc.meta) if doc.meta else {}
for key in keys:
meta_dict.pop(key, None)
is_success, msg = self._inplace_del_meta(meta_dict, kv_pair)
doc_meta[doc.doc_id] = meta_dict
self._manager.set_docs_new_meta(doc_meta)
return BaseResponse(data=None)
except Exception as e:
return BaseResponse(code=500, msg=str(e), data=None)

class UpdateMetadataRequest(BaseModel):
doc_ids: list
kv_pair: dict
doc_ids: List[str]
kv_pair: Dict[str, Union[bool, int, float, str, list]]

@app.post("/update_or_create_metadata_keys")
def update_or_create_metadata_keys(self, update_metadata_request: UpdateMetadataRequest):
Expand All @@ -292,8 +309,8 @@ def update_or_create_metadata_keys(self, update_metadata_request: UpdateMetadata
return BaseResponse(code=500, msg=str(e), data=None)

class ResetMetadataRequest(BaseModel):
doc_ids: list
new_meta: dict
doc_ids: List[str]
new_meta: Dict[str, Union[bool, int, float, str, list]]

@app.post("/reset_metadata")
def reset_metadata(self, reset_metadata_request: ResetMetadataRequest):
Expand Down
2 changes: 1 addition & 1 deletion lazyllm/tools/rag/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def get_docs(self, doc_ids: List[str]) -> List[KBDocument]:
return []

def set_docs_new_meta(self, doc_meta: Dict[str, dict]):
data_to_update = [{"_doc_id": k, "_meta": json.dumps(v)} for k,v in doc_meta.items()]
data_to_update = [{"_doc_id": k, "_meta": json.dumps(v)} for k, v in doc_meta.items()]
with self._db_lock, self._Session() as session:
# Use sqlalchemy core bulk update
stmt = KBDocument.__table__.update().where(
Expand Down
2 changes: 1 addition & 1 deletion tests/basic_tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def test_delete_files_in_store(self):
nodes = self.doc_impl.store.get_nodes(LAZY_ROOT_NAME)
assert cur_meta_dict["title"] == "title2"

url = f'{self.doc_server_addr}/delete_metadata_keys'
url = f'{self.doc_server_addr}/delete_metadata_item'
response = httpx.post(url, json=dict(doc_ids=[test2_docid], keys=["signature"]))
assert response.status_code == 200 and response.json().get('code') == 200
time.sleep(20)
Expand Down
Loading