Add online layout model prediction (#765)

mindspore-lab · Nov 18, 2024 · efb6c66 · efb6c66
1 parent 8f4baaf
commit efb6c66
Show file tree

Hide file tree

Showing 9 changed files with 415 additions and 1 deletion.
diff --git a/configs/layout/yolov8/images/example.jpg b/configs/layout/yolov8/images/example.jpg
diff --git a/mindocr/models/__init__.py b/mindocr/models/__init__.py
@@ -7,6 +7,7 @@
 from .det_fcenet import *
 from .det_psenet import *
 from .kie_layoutxlm import *
+from .layout_yolov8 import *
 from .rec_abinet import *
 from .rec_crnn import *
 from .rec_master import *

diff --git a/mindocr/models/layout_yolov8.py b/mindocr/models/layout_yolov8.py
@@ -0,0 +1,55 @@
+from ._registry import register_model
+from .backbones.mindcv_models.utils import load_pretrained
+from .base_model import BaseModel
+
+
+def _cfg(url="", **kwargs):
+    """Return a config dict holding ``url`` plus any extra keyword entries."""
+    return {"url": url, **kwargs}
+
+
+# Checkpoint config keyed by model name. `yolov8()` in this module passes the
+# "yolov8" entry to `load_pretrained` when it is called with `pretrained=True`.
+default_cfgs = {
+    "yolov8": _cfg(
+        url="https://download.mindspore.cn/toolkits/mindocr/yolov8/yolov8n-4b9e8004.ckpt"
+    ),
+}
+
+
+class LayoutNet(BaseModel):
+    """Layout model class; adds nothing of its own and forwards ``config`` to ``BaseModel``."""
+    def __init__(self, config):
+        BaseModel.__init__(self, config)
+
+
+@register_model
+def yolov8(pretrained=False, pretrained_backbone=True, **kwargs):
+    """Build a ``LayoutNet`` from a fixed YOLOv8 backbone/neck/head config.
+
+    Args:
+        pretrained: when truthy, weights are loaded into the model via
+            ``load_pretrained`` using ``default_cfgs['yolov8']``.
+        pretrained_backbone: accepted but not read anywhere in this function.
+        **kwargs: accepted but not read anywhere in this function.
+
+    Returns:
+        The constructed ``LayoutNet`` instance.
+    """
+    # The whole architecture is hard-coded here; nothing is taken from the
+    # caller's arguments.
+    model_config = {
+        "backbone": {
+            "name": "yolov8_backbone",
+            "depth_multiple": 0.33,
+            "width_multiple": 0.25,
+            "max_channels": 1024,
+            "nc": 5,  # presumably the number of layout classes - TODO confirm
+            "stride": [8, 16, 32, 64],
+            "reg_max": 16,
+            "sync_bn": False,
+            "out_channels": [64, 128, 192, 256],
+        },
+        "neck": {
+            "name": "YOLOv8Neck",
+            "index": [20, 23, 26, 29],
+        },
+        "head": {
+            # nc, reg_max, stride and sync_bn repeat the backbone values above.
+            "name": "YOLOv8Head",
+            "nc": 5,
+            "reg_max": 16,
+            "stride": [8, 16, 32, 64],
+            "sync_bn": False,
+        },
+    }
+    model = LayoutNet(model_config)
+
+    # load pretrained weights
+    if pretrained:
+        default_cfg = default_cfgs['yolov8']
+        load_pretrained(model, default_cfg)
+
+    return model
diff --git a/tools/infer/text/README.md b/tools/infer/text/README.md
@@ -191,7 +191,60 @@ web_cvpr.png	[{"transcription": "canada", "points": [[430, 148], [540, 148], [54
 **Notes:**
 1. For more argument illustrations and usage, please run `python tools/infer/text/predict_system.py -h` or view `tools/infer/text/config.py`
 
+## Layout Analysis
 
+To run layout analysis on an input image or a directory containing multiple images, please execute
+```shell
+python tools/infer/text/predict_layout.py  --image_dir {path_to_img or dir_to_imgs} --layout_algorithm YOLOv8 --visualize_output True
+```
+After running, the inference results will be saved in `{args.draw_img_save_dir}/det_results.txt`, where `--draw_img_save_dir` is the directory for saving results and is set to `./inference_results` by default. Here are some example results.
+
+Example 1:
+<p align="center">
+  <img src="../../../configs/layout/yolov8/images/result.png" width=480>
+</p>
+<p align="center">
+  <em> Visualization of layout analysis result on PMC4958442_00003.jpg</em>
+</p>
+
+The saved layout_result.txt file is as follows:
+```
+{"image_id": 0, "category_id": 1, "bbox": [308.649, 559.189, 240.211, 81.412], "score": 0.98431}
+{"image_id": 0, "category_id": 1, "bbox": [50.435, 673.018, 240.232, 70.262], "score": 0.98414}
+{"image_id": 0, "category_id": 3, "bbox": [322.805, 348.831, 225.949, 203.302], "score": 0.98019}
+{"image_id": 0, "category_id": 1, "bbox": [308.658, 638.657, 240.31, 70.583], "score": 0.97986}
+{"image_id": 0, "category_id": 1, "bbox": [50.616, 604.736, 240.044, 70.086], "score": 0.9797}
+{"image_id": 0, "category_id": 1, "bbox": [50.409, 423.237, 240.132, 183.652], "score": 0.97805}
+{"image_id": 0, "category_id": 1, "bbox": [308.66, 293.918, 240.181, 47.497], "score": 0.97471}
+{"image_id": 0, "category_id": 1, "bbox": [308.64, 707.13, 240.271, 36.028], "score": 0.97427}
+{"image_id": 0, "category_id": 1, "bbox": [308.697, 230.568, 240.062, 43.545], "score": 0.96921}
+{"image_id": 0, "category_id": 4, "bbox": [51.787, 100.444, 240.267, 273.653], "score": 0.96839}
+{"image_id": 0, "category_id": 5, "bbox": [308.637, 74.439, 237.878, 149.174], "score": 0.96707}
+{"image_id": 0, "category_id": 1, "bbox": [50.615, 70.667, 240.068, 22.0], "score": 0.94156}
+{"image_id": 0, "category_id": 2, "bbox": [50.549, 403.5, 67.392, 12.85], "score": 0.92577}
+{"image_id": 0, "category_id": 1, "bbox": [51.384, 374.84, 171.939, 10.736], "score": 0.76692}
+```
+In this file, `image_id` is the image ID, `bbox` is the detected bounding box `[x-coordinate of the top-left corner, y-coordinate of the top-left corner, width, height]`, `score` is the detection confidence, and `category_id` has the following meanings:
+- `1: text`
+- `2: title`
+- `3: list`
+- `4: table`
+- `5: figure`
+
+**Notes:**
+- For more argument illustrations and usage, please run `python tools/infer/text/predict_layout.py -h` or view `tools/infer/text/config.py`
+
+### Supported Layout Analysis Algorithms and Networks
+
+<center>
+
+  | **Algorithm Name** | **Network Name** | **Language** |
+  | :------: | :------: | :------: |
+  |YOLOv8 | yolov8 |English|
+
+</center>
+
+The algorithm-network mapping is defined in `tools/infer/text/predict_layout.py`.
 
 ### Evaluation of the Inference Results
 

diff --git a/tools/infer/text/README_CN.md b/tools/infer/text/README_CN.md
@@ -315,6 +315,61 @@ HDL Cholesterol (mg/dL),42 ± 11.1,46 ± 11.4
 
 所有CLI参数定义都可以通过`python tools/infer/text/predict_system.py -h`或`tools/infer/text/config.py`查看。
 
+## 版面分析
+
+要对输入图像或包含多个图像的目录运行版面分析，请执行
+```shell
+python tools/infer/text/predict_layout.py  --image_dir {path_to_img or dir_to_imgs} --layout_algorithm YOLOv8 --visualize_output True
+```
+运行后，推理结果保存在`{args.draw_img_save_dir}/det_results.txt`中，其中`--draw_img_save_dir`是保存结果的目录，默认为`./inference_results`。以下是一些示例结果。
+
+示例1:
+<p align="center">
+  <img src="../../../configs/layout/yolov8/images/result.png" width=480>
+</p>
+<p align="center">
+  <em> PMC4958442_00003.jpg的可视化结果</em>
+</p>
+
+其中保存的layout_result.txt文件如下
+```
+{"image_id": 0, "category_id": 1, "bbox": [308.649, 559.189, 240.211, 81.412], "score": 0.98431}
+{"image_id": 0, "category_id": 1, "bbox": [50.435, 673.018, 240.232, 70.262], "score": 0.98414}
+{"image_id": 0, "category_id": 3, "bbox": [322.805, 348.831, 225.949, 203.302], "score": 0.98019}
+{"image_id": 0, "category_id": 1, "bbox": [308.658, 638.657, 240.31, 70.583], "score": 0.97986}
+{"image_id": 0, "category_id": 1, "bbox": [50.616, 604.736, 240.044, 70.086], "score": 0.9797}
+{"image_id": 0, "category_id": 1, "bbox": [50.409, 423.237, 240.132, 183.652], "score": 0.97805}
+{"image_id": 0, "category_id": 1, "bbox": [308.66, 293.918, 240.181, 47.497], "score": 0.97471}
+{"image_id": 0, "category_id": 1, "bbox": [308.64, 707.13, 240.271, 36.028], "score": 0.97427}
+{"image_id": 0, "category_id": 1, "bbox": [308.697, 230.568, 240.062, 43.545], "score": 0.96921}
+{"image_id": 0, "category_id": 4, "bbox": [51.787, 100.444, 240.267, 273.653], "score": 0.96839}
+{"image_id": 0, "category_id": 5, "bbox": [308.637, 74.439, 237.878, 149.174], "score": 0.96707}
+{"image_id": 0, "category_id": 1, "bbox": [50.615, 70.667, 240.068, 22.0], "score": 0.94156}
+{"image_id": 0, "category_id": 2, "bbox": [50.549, 403.5, 67.392, 12.85], "score": 0.92577}
+{"image_id": 0, "category_id": 1, "bbox": [51.384, 374.84, 171.939, 10.736], "score": 0.76692}
+```
+其中，`image_id`为图像ID，`bbox`为检测出的边界框`[左上角的x坐标，左上角的y坐标，宽度，高度]`，`score`是检测的置信度，`category_id`的含义如下：
+- `1: text`
+- `2: title`
+- `3: list`
+- `4: table`
+- `5: figure`
+
+**注意事项：**
+- 有关更多参数说明和用法，请运行`python tools/infer/text/predict_layout.py -h`或查看`tools/infer/text/config.py`
+
+### 支持的版面分析算法和网络
+
+<center>
+
+  |**算法名称**|**网络名称**|**语言**|
+  | :------: | :------: | :------: |
+  |YOLOv8 | yolov8 |英语|
+
+</center>
+
+算法与网络的映射关系在`tools/infer/text/predict_layout.py`中定义。
+
 ## 开发人员指南-如何添加新的推断模型
 
 ### 预处理

diff --git a/tools/infer/text/config.py b/tools/infer/text/config.py
@@ -197,6 +197,24 @@ def create_parser():
         "--table_max_len", type=int, default=480, help="max length of the input image for table structure recognition."
     )
 
+    parser.add_argument(
+        "--layout_algorithm", type=str, default="YOLOv8", choices=["YOLOv8"], help="layout analyzer algorithm"
+    )
+
+    parser.add_argument(
+        "--layout_model_dir",
+        type=str,
+        help="directory containing the layout model checkpoint best.ckpt, or path to a specific checkpoint file.",
+    )  # determine the network weights
+
+    parser.add_argument(
+        "--layout_amp_level",
+        type=str,
+        default="O0",
+        choices=["O0", "O1", "O2", "O3"],
+        help="Auto Mixed Precision level. This setting only works on GPU and Ascend",
+    )
+
     return parser
 
 

diff --git a/tools/infer/text/postprocess.py b/tools/infer/text/postprocess.py
@@ -91,6 +91,8 @@ def __init__(self, task="det", algo="DB", rec_char_dict_path=None, **kwargs):
                 merge_no_span_structure=True,
                 box_shape="pad",
             )
+        elif task == "layout":
+            postproc_cfg = dict(name="YOLOv8Postprocess", conf_thres=0.5, iou_thres=0.7, conf_free=True)
 
         postproc_cfg.update(kwargs)
         self.task = task
@@ -154,4 +156,6 @@ def __call__(self, pred, data=None, **kwargs):
             return output
         elif self.task == "table":
             output = self.postprocess(pred, labels=kwargs.get("labels"))
+        elif self.task == "layout":
+            output = self.postprocess(pred, img_shape=kwargs.get("img_shape"), meta_info=kwargs.get("meta_info"))
             return output