diff --git a/.github/workflows/test_labeling.yaml b/.github/workflows/test_labeling.yaml index 25a8126c..f5474cfa 100644 --- a/.github/workflows/test_labeling.yaml +++ b/.github/workflows/test_labeling.yaml @@ -31,4 +31,4 @@ jobs: python-version: '3.9' - name: Run tests for labeling run: | - test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test + test="true" docker compose -f labeling/docker-compose.yml up --build --exit-code-from test diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index d46ad036..a66acbde 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -36,4 +36,4 @@ jobs: flake8 . - name: Run tests run: | - test="true" docker-compose up --build --exit-code-from test + test="true" docker compose up --build --exit-code-from test diff --git a/README.md b/README.md index bbdc18af..bb2cba6e 100644 --- a/README.md +++ b/README.md @@ -136,12 +136,12 @@ cd dedoc ### 3. Build the image and run the application ```shell -docker-compose up --build +docker compose up --build ``` ### 4. Run container with tests ```shell -test="true" docker-compose up --build +test="true" docker compose up --build ``` If you need to change some application settings, you may update `config.py` according to your needs and re-build the image. 
diff --git a/dedoc/download_models.py b/dedoc/download_models.py
index fa209b14..64902fa7 100644
--- a/dedoc/download_models.py
+++ b/dedoc/download_models.py
@@ -6,7 +6,7 @@
 """
 model_hash_dict = dict(
     txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
-    scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
+    scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e",
     font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
     paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
     line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py
index bcd1fd68..a76a9d55 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import warnings
 from os import path
 from typing import Optional, Tuple
@@ -30,11 +31,9 @@ def __init__(self, on_gpu: bool, checkpoint_path: Optional[str], *, config: dict
     @property
     def net(self) -> ClassificationModelTorch:
         if self._net is None:
+            net = ClassificationModelTorch(self.checkpoint_path)
             if self.checkpoint_path is not None:
-                net = ClassificationModelTorch(path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth"))
                 self._load_weights(net)
-            else:
-                net = ClassificationModelTorch(None)
             self._net = net
             self._net.to(self.device)
         return self._net
@@ -61,17 +60,33 @@ def _set_device(self, on_gpu: bool) -> None:
         self.logger.warning(f"Classifier is set to device {self.device}")
 
     def _load_weights(self, net: ClassificationModelTorch) -> None:
-        path_checkpoint = path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth")
-        if not path.isfile(path_checkpoint):
-            download_from_hub(out_dir=self.checkpoint_path,
+        """
+        Load the weights from ``self.checkpoint_path`` into the given network.
+
+        If the checkpoint file doesn't exist, ``self.checkpoint_path`` is replaced by the default checkpoint
+        from the resources directory; the default file is downloaded from the hub only when it is missing as well.
+
+        :param net: network to load the weights into
+        """
+        if not path.isfile(self.checkpoint_path):
+            from dedoc.config import get_config
+            default_path = os.path.join(get_config()["resources_path"], "scan_orientation_efficient_net_b0.pth")
+            if path.abspath(self.checkpoint_path) != path.abspath(default_path):
+                # a custom checkpoint was requested, don't replace it with the default one silently
+                self.logger.warning(f"Checkpoint {self.checkpoint_path} isn't found, {default_path} is used instead")
+            self.checkpoint_path = default_path
+
+        # download only when the file is still missing, so weights that are already on disk work without network access
+        if not path.isfile(self.checkpoint_path):
+            download_from_hub(out_dir=os.path.dirname(os.path.abspath(self.checkpoint_path)),
                               out_name="scan_orientation_efficient_net_b0.pth",
                               repo_name="scan_orientation_efficient_net_b0",
                               hub_name="model.pth")
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            net.load_state_dict(torch.load(path_checkpoint, map_location=self.location))
-            self.logger.info(f"Weights were loaded from {path_checkpoint}")
+            net.load_state_dict(torch.load(self.checkpoint_path, map_location=self.location))
+            self.logger.info(f"Weights were loaded from {self.checkpoint_path}")
 
     def save_weights(self, path_checkpoint: str) -> None:
         torch.save(self.net.state_dict(), path_checkpoint)
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
index 0a9a54a3..8b635ad7 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py
@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional, Tuple
 
 from numpy import ndarray
@@ -46,7 +47,9 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
         )
         self.skew_corrector = SkewCorrector()
         self.column_orientation_classifier = ColumnsOrientationClassifier(on_gpu=self.config.get("on_gpu", False),
-                                                                          checkpoint_path=get_config()["resources_path"], config=self.config)
+                                                                          checkpoint_path=os.path.join(get_config()["resources_path"],
+                                                                                                       "scan_orientation_efficient_net_b0.pth"),
+                                                                          config=self.config)
         self.binarizer = AdaptiveBinarizer()
         self.ocr = OCRLineExtractor(config=self.config)
 
diff --git a/docs/source/getting_started/installation.rst
b/docs/source/getting_started/installation.rst index c6d068b4..faab0250 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -31,7 +31,7 @@ still, the docker application should be installed and configured properly. .. code-block:: bash - docker-compose up --build + docker compose up --build If you need to change some application settings, you may update ``config.py`` according to your needs and re-build the image. diff --git a/resources/benchmarks/orient_classifier_scores.txt b/resources/benchmarks/orient_classifier_scores.txt index 9fe55d01..c052e058 100644 --- a/resources/benchmarks/orient_classifier_scores.txt +++ b/resources/benchmarks/orient_classifier_scores.txt @@ -3,13 +3,13 @@ Orientation predictions: +-------+-----------+--------+-------+-------+ | Class | Precision | Recall | F1 | Count | +=======+===========+========+=======+=======+ -| 0 | 0.998 | 1 | 0.999 | 537 | +| 0 | 0.998 | 1 | 0.999 | 825 | +-------+-----------+--------+-------+-------+ -| 90 | 1 | 0.998 | 0.999 | 537 | +| 90 | 1 | 0.999 | 0.999 | 825 | +-------+-----------+--------+-------+-------+ -| 180 | 1 | 0.998 | 0.999 | 537 | +| 180 | 1 | 0.998 | 0.999 | 825 | +-------+-----------+--------+-------+-------+ -| 270 | 0.998 | 1 | 0.999 | 537 | +| 270 | 0.999 | 1 | 0.999 | 825 | +-------+-----------+--------+-------+-------+ | AVG | 0.999 | 0.999 | 0.999 | None | +-------+-----------+--------+-------+-------+ @@ -17,9 +17,9 @@ Column predictions: +-------+-----------+--------+-------+-------+ | Class | Precision | Recall | F1 | Count | +=======+===========+========+=======+=======+ -| 1 | 1 | 0.999 | 0.999 | 1692 | +| 1 | 0.999 | 1 | 0.999 | 1944 | +-------+-----------+--------+-------+-------+ -| 2 | 0.996 | 1 | 0.998 | 456 | +| 2 | 1 | 0.999 | 0.999 | 1356 | +-------+-----------+--------+-------+-------+ | AVG | 0.999 | 0.999 | 0.999 | None | +-------+-----------+--------+-------+-------+ \ No newline at end of file diff --git 
a/scripts/train/train_eval_orientation_classifier.py b/scripts/train/train_eval_orientation_classifier.py
index abd558b6..27084950 100644
--- a/scripts/train/train_eval_orientation_classifier.py
+++ b/scripts/train/train_eval_orientation_classifier.py
@@ -1,10 +1,15 @@
 import argparse
 import os
+import shutil
+import subprocess
+import sys
+import zipfile
 from time import time
 from typing import List
 
 import numpy as np
 import torch
+from huggingface_hub import hf_hub_download
 from sklearn.metrics import precision_recall_fscore_support
 from texttable import Texttable
 from torch import nn
@@ -19,17 +24,33 @@
 parser = argparse.ArgumentParser()
 checkpoint_path_save = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth"))
 checkpoint_path_load = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "efficient_net_b0_fixed.pth"))
-checkpoint_path = "../../resources"
-output_dir = os.path.abspath(os.path.join(checkpoint_path, "benchmarks"))
+# resolved relative to this file (like the checkpoint paths above), so the script may be run from any working directory
+output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks"))
+
+
+def str_to_bool(value: str) -> bool:
+    """
+    Convert a command line value like "True" or "False" to bool.
+    Plain ``type=bool`` can't be used for this: it treats any non-empty string (including "False") as True.
+    """
+    normalized = value.strip().lower()
+    if normalized in ("true", "t", "yes", "y", "1"):
+        return True
+    if normalized in ("false", "f", "no", "n", "0", ""):
+        return False
+    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")
+
 
-parser.add_argument("-t", "--train", type=bool, help="run for train model", default=False)
+parser.add_argument("-t", "--train", type=str_to_bool, help="run for train model", default=False)
 parser.add_argument("-s", "--checkpoint_save", help="Path to checkpoint for save or load", default=checkpoint_path_save)
 parser.add_argument("-l", "--checkpoint_load", help="Path to checkpoint for load", default=checkpoint_path_load)
-parser.add_argument("-f", "--from_checkpoint", type=bool, help="run for train model", default=True)
-parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test")
+parser.add_argument("-f", "--from_checkpoint", type=str_to_bool, help="load weights from the checkpoint_load path", default=True)
+parser.add_argument("-d", "--input_data_folder", help="Path to data with folders train or test",
+                    default=os.path.join(get_config()["intermediate_data_path"], "orientation_columns_dataset"))
+parser.add_argument("-b", "--batch_size", type=int, help="Batch size", default=1)
 
 args = parser.parse_args()
-BATCH_SIZE = 1
+BATCH_SIZE = args.batch_size
 ON_GPU = True
 
 """
@@ -191,10 +212,61 @@ def train_step(data_executor: DataLoaderImageOrient, classifier: ColumnsOrientat
     train_model(trainloader, args.checkpoint_save, classifier)
 
 
+def create_dataset() -> None:
+    """
+    Make sure the dataset for the orientation classifier exists and store its path in ``args.input_data_folder``.
+
+    If ``args.input_data_folder`` is an existing directory, it is used as is.
+    Otherwise, the dataset generated by a previous run is reused, or the source images are downloaded
+    from the HuggingFace hub and rotated with the ``gen_dataset.py`` script shipped in the archive.
+
+    :raises subprocess.CalledProcessError: if the dataset generation script fails
+    """
+    if os.path.isdir(args.input_data_folder):
+        return
+
+    datasets_path = os.path.join(get_config()["resources_path"], "datasets")
+    final_dataset_folder = os.path.join(datasets_path, "columns_orientation_dataset")
+    # the default input folder differs from the generated one, so check for the result of a previous run explicitly
+    if os.path.isdir(final_dataset_folder) and os.listdir(final_dataset_folder):
+        args.input_data_folder = final_dataset_folder
+        return
+
+    # download source files
+    os.makedirs(datasets_path, exist_ok=True)
+    intermediate_path = os.path.realpath(hf_hub_download(repo_id="dedoc/orientation_columns_dataset",
+                                                         filename="generate_dataset_orient_classifier.zip",
+                                                         repo_type="dataset",
+                                                         revision="902cc77dbd28e63dbb74dfc14a7a7b198e9d6f9d"))
+    source_dataset_archive = os.path.join(datasets_path, "generate_dataset_orient_classifier.zip")
+    shutil.move(intermediate_path, source_dataset_archive)
+
+    with zipfile.ZipFile(source_dataset_archive, "r") as zip_ref:
+        zip_ref.extractall(datasets_path)
+    os.remove(source_dataset_archive)
+
+    # rotate source files
+    src_pics_path = os.path.join(datasets_path, "generate_dataset_orient_classifier", "src")
+    scripts_path = os.path.join(datasets_path, "generate_dataset_orient_classifier", "scripts")
+    os.makedirs(final_dataset_folder, exist_ok=True)
+    command = [sys.executable, os.path.join(scripts_path, "gen_dataset.py"), "-i", src_pics_path, "-o", final_dataset_folder]
+    try:
+        # an argument list (no shell) handles paths with spaces, check=True stops the run if the generation fails
+        subprocess.run(command, check=True)
+    except subprocess.CalledProcessError:
+        # don't leave a partially generated dataset that would be reused by the next run
+        shutil.rmtree(final_dataset_folder, ignore_errors=True)
+        raise
+    args.input_data_folder = final_dataset_folder
+
+
 if __name__ == "__main__":
     config = get_config()
     data_executor = DataLoaderImageOrient()
-    net = ColumnsOrientationClassifier(on_gpu=ON_GPU, checkpoint_path=checkpoint_path if not args.train else "", config=config)
+    create_dataset()
+    # None (not "") disables weights loading: the classifier treats "" as a missing checkpoint file and loads the default weights
+    checkpoint = args.checkpoint_load if args.from_checkpoint else None
+    net = ColumnsOrientationClassifier(on_gpu=ON_GPU, checkpoint_path=checkpoint, config=config)
     if args.train:
         train_step(data_executor, net)
     else:
diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py index
5a21ae32..b93183f9 100644 --- a/tests/unit_tests/test_format_pdf_reader.py +++ b/tests/unit_tests/test_format_pdf_reader.py @@ -16,7 +16,7 @@ class TestPDFReader(unittest.TestCase): - checkpoint_path = get_test_config()["resources_path"] + checkpoint_path = os.path.join(get_test_config()["resources_path"], "scan_orientation_efficient_net_b0.pth") config = get_test_config() orientation_classifier = ColumnsOrientationClassifier(on_gpu=False, checkpoint_path=checkpoint_path, config=config) diff --git a/tests/unit_tests/test_misc_on_gpu.py b/tests/unit_tests/test_misc_on_gpu.py index 021325f1..23ed5278 100644 --- a/tests/unit_tests/test_misc_on_gpu.py +++ b/tests/unit_tests/test_misc_on_gpu.py @@ -32,7 +32,7 @@ def test_line_type_classifier(self) -> None: self.assertListEqual(predictions, ["header", "header", "cellar"]) def test_orientation_classifier(self) -> None: - checkpoint_path = get_test_config()["resources_path"] + checkpoint_path = os.path.join(get_test_config()["resources_path"], "scan_orientation_efficient_net_b0.pth") orientation_classifier = ColumnsOrientationClassifier(on_gpu=self.config.get("on_gpu", False), checkpoint_path=checkpoint_path, config=self.config) imgs_path = [f"../data/skew_corrector/rotated_{i}.jpg" for i in range(1, 5)]