From 6274740fa92de5a38d9c5bd49a35b53e342cfb94 Mon Sep 17 00:00:00 2001
From: imhuwq
Date: Tue, 12 Mar 2024 16:23:03 +0800
Subject: [PATCH 1/3] feature(images api): support random visits

---
 deepdataspace/plugins/coco2017/importer.py |   7 +-
 .../server/resources/api_v1/images.py      | 127 +++++++++++-------
 2 files changed, 85 insertions(+), 49 deletions(-)

diff --git a/deepdataspace/plugins/coco2017/importer.py b/deepdataspace/plugins/coco2017/importer.py
index 4016118..0902bf6 100644
--- a/deepdataspace/plugins/coco2017/importer.py
+++ b/deepdataspace/plugins/coco2017/importer.py
@@ -34,6 +34,8 @@ def __init__(self, meta_path: str, enforce: bool = False):
         info = self.parse_meta(meta_path)
         if info is None:
             raise RuntimeError(f"Cannot import coco dataset: {meta_path}")
+        else:
+            logger.info(f"Successfully parsed meta file {meta_path}: {info}")
 
         dataset_name = info["dataset_name"]
         self.ground_truth = info["ground_truth"]
@@ -100,9 +102,8 @@ def parse_meta(meta_path: str):
             logger.error(traceback.format_exc())
             logger.error(f"Failed to parse meta file {meta_path}: {err}")
             return None
-
-        logger.info(f"Successfully parsed meta file {meta_path}: {info}")
-        return info
+        else:
+            return info
 
     def load_ground_truth(self):
         with open(self.ground_truth, "r", encoding="utf8") as fp:
diff --git a/deepdataspace/server/resources/api_v1/images.py b/deepdataspace/server/resources/api_v1/images.py
index acd303c..92dedbc 100644
--- a/deepdataspace/server/resources/api_v1/images.py
+++ b/deepdataspace/server/resources/api_v1/images.py
@@ -6,11 +6,12 @@
 
 import json
 import logging
+from random import randint
 
 from deepdataspace.constants import DatasetFileType
 from deepdataspace.constants import DatasetStatus
+from deepdataspace.constants import DatasetType
 from deepdataspace.constants import ErrCode
-from deepdataspace.constants import LabelType
 from deepdataspace.model import DataSet
 from deepdataspace.model.image import Image
 from deepdataspace.plugins.coco2017 import COCO2017Importer
@@ -19,7 +20,6 @@
 from deepdataspace.utils.http import format_response
 from deepdataspace.utils.http import parse_arguments
 from deepdataspace.utils.http import raise_exception
-from deepdataspace.constants import DatasetType
 
 logger = logging.getLogger("django")
 
@@ -68,9 +68,9 @@ class ImagesView(BaseAPIView):
         Argument("dataset_id", str, Argument.QUERY, required=True),
         Argument("category_id", str, Argument.QUERY, required=False),
         Argument("flag", int, Argument.QUERY, required=False),
-        Argument("label_id", str, Argument.QUERY, required=False),
         Argument("page_num", Argument.PositiveInt, Argument.QUERY, default=1),
-        Argument("page_size", Argument.PositiveInt, Argument.QUERY, default=100)
+        Argument("page_size", Argument.PositiveInt, Argument.QUERY, default=100),
+        Argument("offset", int, Argument.QUERY, required=False, default=None),
     ]
 
     def get(self, request):
@@ -79,7 +79,7 @@ def get(self, request):
         """
         - GET /api/v1/images
         """
-        dataset_id, category_id, flag, label_id, page_num, page_size = parse_arguments(request, self.get_args)
+        dataset_id, category_id, flag, page_num, page_size, offset = parse_arguments(request, self.get_args)
 
         dataset = DataSet.find_one({"_id": dataset_id})
         if dataset is None:
 
         filters = {}
         if category_id is not None:
-            filters = {"objects": {
-                "$elemMatch": {
-                    "category_id": category_id,
-                    "label_type" : {"$in": [LabelType.User, LabelType.GroundTruth]}}}
-            }
+            filters["objects.category_id"] = category_id
 
         if flag is not None:
             filters["flag"] = flag
 
         total = Image(dataset_id).count_num(filters)
-        image_list = []
-        offset = max(0, page_size * (page_num - 1))
+        if offset is None:
+            skip = max(0, page_size * (page_num - 1))
+        else:
+            skip = 0
+            page_num = None
+            if offset == -1:  # generate a random offset
+                includes = {"_id": 1, "idx": 1}
+                max_idx = Image(dataset_id).find_many(filters, includes,
+                                                      sort=[("idx", -1)],
+                                                      skip=0, size=1,
+                                                      to_dict=True)
+                max_idx = list(max_idx)[0]["idx"]
+
+                min_idx = Image(dataset_id).find_many(filters, includes,
+                                                      sort=[("idx", 1)],
+                                                      skip=0, size=1,
+                                                      to_dict=True)
+                min_idx = list(min_idx)[0]["idx"]
+
+                offset = randint(min_idx, max_idx)
+
+                # best effort to return at least page_size images
+                if max_idx - offset + 1 < page_size:
+                    offset = max(min_idx, max_idx - page_size + 1)
+                filters["idx"] = {"$gte": offset}
+            elif offset >= 0:  # query by the specified offset
+                filters["idx"] = {"$gte": offset}
+            else:
+                raise_exception(ErrCode.BadRequest, f"invalid offset value [{offset}]")
+
+        if skip > total:
+            data = {
+                "image_list": [],
+                "offset"    : offset,
+                "page_size" : page_size,
+                "page_num"  : page_num,
+                "total"     : total
+            }
+            return format_response(data, enable_cache=True)
 
-        includes = {"id", "idx", "flag", "objects", "metadata", "type", "width", "height", "url",
-                    "url_full_res"}
+        includes = {"id", "idx", "flag", "objects", "metadata",
+                    "type", "width", "height", "url", "url_full_res"}
         includes = {i: 1 for i in includes}
 
         req_scheme = request.scheme
         req_host = request.META["HTTP_HOST"]
         req_prefix = f"{req_scheme}://{req_host}"
 
-        if offset <= total:
-            for image in Image(dataset_id).find_many(filters, includes,
-                                                     sort=[("idx", 1)],
-                                                     skip=offset,
-                                                     size=page_size,
-                                                     to_dict=True):
-                for obj in image["objects"]:
-                    obj["source"] = obj["label_type"]  # TODO keep for compatibility, delete this in the future
+        image_list = []
+        for image in Image(dataset_id).find_many(filters,
+                                                 includes,
+                                                 sort=[("idx", 1)],
+                                                 skip=skip,
+                                                 size=page_size,
+                                                 to_dict=True):
+            for obj in image["objects"]:
+                obj["source"] = obj["label_type"]  # TODO keep for compatibility, delete this in the future
 
-                    alpha = obj.get("alpha", "")
-                    if alpha is None:
-                        obj["alpha"] = ""
-                    elif not alpha.startswith("http"):
-                        obj["alpha"] = f"{req_prefix}{alpha}"
+                alpha = obj.get("alpha", "")
+                if alpha is None:
+                    obj["alpha"] = ""
+                elif not alpha.startswith("http"):
+                    obj["alpha"] = f"{req_prefix}{alpha}"
 
-                    if obj["segmentation"] is None:
-                        obj["segmentation"] = ""
+                if obj["segmentation"] is None:
+                    obj["segmentation"] = ""
 
-                    obj["caption"] = obj["caption"] or ""
+                obj["caption"] = obj["caption"] or ""
 
-                    obj.pop("compare_result", None)
+                obj.pop("compare_result", None)
 
-                image_url = image["url"]
-                image_url = concat_url(req_prefix, image_url)
+            image_url = image["url"]
+            image_url = concat_url(req_prefix, image_url)
 
-                image_url_full_res = image["url_full_res"] or image_url
-                image_url_full_res = concat_url(req_prefix, image_url_full_res)
+            image_url_full_res = image["url_full_res"] or image_url
+            image_url_full_res = concat_url(req_prefix, image_url_full_res)
 
-                desc = image.pop("metadata") or "{}"
+            desc = image.pop("metadata") or "{}"
 
-                image.update({
-                    "desc"        : desc,
-                    "metadata"    : json.loads(desc),
-                    "url"         : image_url,
-                    "url_full_res": image_url_full_res
-                })
+            image.update({
+                "desc"        : desc,
+                "metadata"    : json.loads(desc),
+                "url"         : image_url,
+                "url_full_res": image_url_full_res
+            })
 
-                image["caption"] = ""
-                if caption_generator:
-                    image["caption"] = caption_generator(image)
+            image["caption"] = ""
+            if caption_generator:
+                image["caption"] = caption_generator(image)
 
-                image_list.append(image)
+            image_list.append(image)
 
         data = {
             "image_list": image_list,
+            "offset"    : offset,
             "page_size" : page_size,
             "page_num"  : page_num,
             "total"     : total
         }
         return format_response(data, enable_cache=True)

From 3ad9706d97de0902da74d1a7baefcb4dff23ccd1 Mon Sep 17 00:00:00 2001
From: imhuwq
Date: Tue, 12 Mar 2024 16:27:11 +0800
Subject: [PATCH 2/3] feature(importer): create indexes on objects.category_id and idx in Importer.post_run

---
 deepdataspace/io/importer.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index d30c5e5..e758eba 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -17,8 +17,8 @@
 from typing import Type
 from typing import Union
 
-from tqdm import tqdm
 from pymongo import WriteConcern
+from tqdm import tqdm
 
 from deepdataspace import constants
 from deepdataspace.constants import AnnotationType
@@ -325,12 +325,21 @@ def pre_run(self):
 
     def post_run(self):
         """
-        A post-run hook for subclass importers to clean up data.
+        A post-run hook for subclass importers.
         """
         self.dataset.add_cover()
         DataSet.update_one({"id": self.dataset.id}, {"status": DatasetStatus.Ready})
         self.dataset = DataSet.find_one({"id": self.dataset.id})
 
+        dataset_id = self.dataset.id
+        Image(dataset_id).get_collection().create_index([
+            ("objects.category_id", 1),
+        ])
+
+        Image(dataset_id).get_collection().create_index([
+            ("idx", 1)
+        ])
+
     def on_error(self, err: Exception):
         """
         A hook to handle error.
@@ -348,13 +357,13 @@ def load_existing_user_data(self):
         """
 
         pipeline = [
-            {"$project": {"flag": 1,
+            {"$project": {"flag"   : 1,
                           "flag_ts": 1,
                           "objects": {
                               "$filter": {
                                   "input": "$objects",
-                                  "as": "object",
-                                  "cond": {
+                                  "as"   : "object",
+                                  "cond" : {
                                       "$eq": ["$$object.label_type", LabelType.User]
                                   }
                               }
@@ -374,7 +383,7 @@ def load_existing_user_data(self):
 
             self._user_data[image_id] = {
                 "objects": user_objects,
-                "flag": flag,
+                "flag"   : flag,
                 "flag_ts": flag_ts,
             }
 
@@ -400,7 +409,7 @@ def run_import(self):
 
         desc = f"dataset[{self.dataset.name}@{self.dataset.id}] import progress"
         for (image, anno_list) in tqdm(self, desc=desc, unit=" images"):
-        # for (image, anno_list) in self:
+            # for (image, anno_list) in self:
             image = self.dataset_import_image(self.dataset, **image)
             self.image_add_user_data(image)
             for anno in anno_list:

From 10c1939ac83b2074ab549365c557b927a088da8f Mon Sep 17 00:00:00 2001
From: imhuwq
Date: Tue, 12 Mar 2024 16:32:40 +0800
Subject: [PATCH 3/3] refactor(Importer): restructure post_run internal steps

---
 deepdataspace/io/importer.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/deepdataspace/io/importer.py b/deepdataspace/io/importer.py
index e758eba..057ba99 100644
--- a/deepdataspace/io/importer.py
+++ b/deepdataspace/io/importer.py
@@ -327,10 +327,10 @@ def post_run(self):
         """
         A post-run hook for subclass importers.
""" + logger.info(f"Add cover to dataset [{self.dataset.name}]@[{self.dataset.id}]") self.dataset.add_cover() - DataSet.update_one({"id": self.dataset.id}, {"status": DatasetStatus.Ready}) - self.dataset = DataSet.find_one({"id": self.dataset.id}) + logger.info(f"Add indices to dataset [{self.dataset.name}]@[{self.dataset.id}]") dataset_id = self.dataset.id Image(dataset_id).get_collection().create_index([ ("objects.category_id", 1), @@ -340,6 +340,10 @@ def post_run(self): ("idx", 1) ]) + logger.info(f"Set status ready for dataset [{self.dataset.name}]@[{self.dataset.id}]") + DataSet.update_one({"id": self.dataset.id}, {"status": DatasetStatus.Ready}) + self.dataset = DataSet.find_one({"id": self.dataset.id}) + def on_error(self, err: Exception): """ A hook to handle error.