From 9dfbced0484e00db66a406549330f963271c77e3 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Wed, 12 Feb 2025 18:22:04 +0400 Subject: [PATCH 1/4] do not keep all annotations in RAM on backup --- cvat/apps/engine/backup.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 49544d90aa77..0c4d3ee1ee42 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: MIT import io +import json import mimetypes import os import re @@ -21,11 +22,13 @@ from zipfile import ZipFile import django_rq +import json_stream from django.conf import settings from django.core.exceptions import ObjectDoesNotExist from django.db import transaction from django.utils import timezone from rest_framework import serializers, status +from rest_framework.compat import SHORT_SEPARATORS from rest_framework.exceptions import ValidationError from rest_framework.parsers import JSONParser from rest_framework.renderers import JSONRenderer @@ -596,22 +599,24 @@ def serialize_data(): target_manifest_file = os.path.join(target_dir, self.MANIFEST_FILENAME) if target_dir else self.MANIFEST_FILENAME zip_object.writestr(target_manifest_file, data=JSONRenderer().render(task)) - def _write_annotations(self, zip_object, target_dir=None): + def _write_annotations(self, zip_object: ZipFile, target_dir: Optional[str] = None) -> None: + @json_stream.streamable_list def serialize_annotations(): - job_annotations = [] db_jobs = self._get_db_jobs() db_job_ids = (j.id for j in db_jobs) for db_job_id in db_job_ids: annotations = dm.task.get_job_data(db_job_id) annotations_serializer = LabeledDataSerializer(data=annotations) annotations_serializer.is_valid(raise_exception=True) - job_annotations.append(self._prepare_annotations(annotations_serializer.data, self._label_mapping)) - - return job_annotations + yield self._prepare_annotations(annotations_serializer.data, self._label_mapping) annotations = serialize_annotations() target_annotations_file = os.path.join(target_dir, self.ANNOTATIONS_FILENAME) if target_dir else self.ANNOTATIONS_FILENAME - zip_object.writestr(target_annotations_file, data=JSONRenderer().render(annotations)) + with TmpDirManager.get_tmp_directory() as temp_dir: + tmp_json_file = os.path.join(temp_dir, "tmp.json") + with open(tmp_json_file, 'w') as f: + json.dump(annotations, f, separators=SHORT_SEPARATORS) + zip_object.write(tmp_json_file, arcname=target_annotations_file) def _export_task(self, zip_obj, target_dir=None): self._write_data(zip_obj, target_dir) From c48d228d9d26572c15017f0c54f824ab915dc1c3 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Thu, 13 Feb 2025 01:41:16 +0400 Subject: [PATCH 2/4] better streaming for annotations --- cvat/apps/engine/backup.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 0c4d3ee1ee42..927689f3b295 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: MIT +import encodings import io import json import mimetypes @@ -28,7 +29,6 @@ from django.db import transaction from django.utils import timezone from rest_framework import serializers, status -from rest_framework.compat import SHORT_SEPARATORS from rest_framework.exceptions import ValidationError from rest_framework.parsers import JSONParser from rest_framework.renderers import JSONRenderer @@ -612,11 +612,8 @@ def serialize_annotations(): annotations = serialize_annotations() target_annotations_file = os.path.join(target_dir, self.ANNOTATIONS_FILENAME) if target_dir else self.ANNOTATIONS_FILENAME - with TmpDirManager.get_tmp_directory() as temp_dir: - tmp_json_file = os.path.join(temp_dir, "tmp.json") - with open(tmp_json_file, 'w') as f: - json.dump(annotations, f, separators=SHORT_SEPARATORS) - zip_object.write(tmp_json_file, arcname=target_annotations_file) + with zip_object.open(target_annotations_file, 'w') as f: + json.dump(annotations, encodings.utf_8.StreamWriter(f), separators=(',', ':')) def _export_task(self, zip_obj, target_dir=None): self._write_data(zip_obj, target_dir) From 8b45d0b30778d56eaa6f411008337ed8b22a63c2 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Thu, 13 Feb 2025 11:13:39 +0400 Subject: [PATCH 3/4] updating requirements --- cvat/requirements/base.in | 1 + cvat/requirements/base.txt | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in index 1172778f8baa..c812da5440b3 100644 --- a/cvat/requirements/base.in +++ b/cvat/requirements/base.in @@ -57,3 +57,4 @@ rq==1.16.0 rules>=3.3 Shapely==1.7.1 xmlsec>=1.3.14,<2 +json-stream>=2.0 diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt index feb15d9183ce..c663053ff5ab 100644 --- a/cvat/requirements/base.txt +++ b/cvat/requirements/base.txt @@ -1,4 +1,4 @@ -# SHA1:9c45ee6ba604552349bcaf41a8f35abbc7c62ddd +# SHA1:02cd495ccf64874404b603b505a50d84acc316cc # # This file is autogenerated by pip-compile-multi # To update, run: @@ -29,7 +29,7 @@ botocore==1.20.112 # s3transfer cachetools==5.5.1 # via google-auth -certifi==2024.12.14 +certifi==2025.1.31 # via # clickhouse-connect # msrest @@ -52,7 +52,7 @@ coreschema==0.0.4 # via coreapi crontab==1.0.1 # via rq-scheduler -cryptography==44.0.0 +cryptography==44.0.1 # via # azure-storage-blob # datumaro @@ -71,7 +71,7 @@ dj-pagination==2.5.0 # via -r cvat/requirements/base.in dj-rest-auth[with-social]==5.0.2 # via -r cvat/requirements/base.in -django==4.2.18 +django==4.2.19 # via # -r cvat/requirements/base.in # dj-rest-auth @@ -119,7 +119,7 @@ easyprocess==1.1 # via pyunpack entrypoint2==1.1 # via pyunpack -fonttools==4.55.8 +fonttools==4.56.0 # via matplotlib freezegun==1.5.1 # via rq-scheduler @@ -142,7 +142,7 @@ google-crc32c==1.6.0 # via google-resumable-media google-resumable-media==2.7.2 # via google-cloud-storage -googleapis-common-protos==1.66.0 +googleapis-common-protos==1.67.0 # via google-api-core h5py==3.12.1 # via datumaro @@ -175,7 +175,9 @@ joblib==1.4.2 # nltk # scikit-learn json-stream==2.3.3 - # via datumaro + # via + # -r cvat/requirements/base.in + # datumaro json-stream-rs-tokenizer==0.4.27 # via json-stream jsonschema==4.17.3 @@ -184,7 +186,7 @@ kiwisolver==1.4.7 # via matplotlib limits==4.0.1 # via python-logstash-async -lxml==5.3.0 +lxml==5.3.1 # via # -r cvat/requirements/base.in # datumaro @@ -285,7 +287,7 @@ python3-openid==3.2.0 # via django-allauth python3-saml==1.16.0 # via django-allauth -pytz==2024.2 +pytz==2025.1 # via # clickhouse-connect # pandas From de6d3a138d49b6d10e907342e4d11bed35bdaf85 Mon Sep 17 00:00:00 2001 From: Dmitrii Lavrukhin Date: Thu, 13 Feb 2025 16:17:30 +0400 Subject: [PATCH 4/4] fixes --- cvat/apps/engine/backup.py | 4 ++-- cvat/requirements/base.in | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index 927689f3b295..87f9244fc537 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -3,7 +3,7 @@ # # SPDX-License-Identifier: MIT -import encodings +import codecs import io import json import mimetypes @@ -613,7 +613,7 @@ def serialize_annotations(): annotations = serialize_annotations() target_annotations_file = os.path.join(target_dir, self.ANNOTATIONS_FILENAME) if target_dir else self.ANNOTATIONS_FILENAME with zip_object.open(target_annotations_file, 'w') as f: - json.dump(annotations, encodings.utf_8.StreamWriter(f), separators=(',', ':')) + json.dump(annotations, codecs.getwriter('utf-8')(f), separators=(',', ':')) def _export_task(self, zip_obj, target_dir=None): self._write_data(zip_obj, target_dir) diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in index c812da5440b3..380e066d8893 100644 --- a/cvat/requirements/base.in +++ b/cvat/requirements/base.in @@ -33,6 +33,7 @@ djangorestframework>=3.15.2,<4 drf-spectacular==0.26.2 furl==2.1.0 google-cloud-storage==1.42.0 +json-stream>=2.0 lxml>=5.2.1,<6 natsort==8.0.0 numpy~=1.22.2 @@ -57,4 +58,3 @@ rq==1.16.0 rules>=3.3 Shapely==1.7.1 xmlsec>=1.3.14,<2 -json-stream>=2.0