Skip to content

Commit

Permalink
Add online layout model prediction (#765)
Browse files Browse the repository at this point in the history
  • Loading branch information
can-gaa-hou authored Nov 18, 2024
1 parent 8f4baaf commit efb6c66
Show file tree
Hide file tree
Showing 9 changed files with 415 additions and 1 deletion.
Binary file added configs/layout/yolov8/images/example.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions mindocr/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .det_fcenet import *
from .det_psenet import *
from .kie_layoutxlm import *
from .layout_yolov8 import *
from .rec_abinet import *
from .rec_crnn import *
from .rec_master import *
Expand Down
55 changes: 55 additions & 0 deletions mindocr/models/layout_yolov8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from ._registry import register_model
from .backbones.mindcv_models.utils import load_pretrained
from .base_model import BaseModel


def _cfg(url="", **kwargs):
return {"url": url, **kwargs}


# Pretrained checkpoint entries, keyed by the model name used to look them up.
default_cfgs = dict(
    yolov8=_cfg(url="https://download.mindspore.cn/toolkits/mindocr/yolov8/yolov8n-4b9e8004.ckpt"),
)


class LayoutNet(BaseModel):
    """Layout-analysis network: a thin ``BaseModel`` subclass.

    It adds no behaviour of its own; the given config is forwarded unchanged
    to ``BaseModel``.
    """

    def __init__(self, config):
        """Initialise the network from ``config``.

        Args:
            config: Model configuration passed straight through to
                ``BaseModel``. The factory in this module supplies a dict
                with ``"backbone"``, ``"neck"`` and ``"head"`` entries.
        """
        super().__init__(config)


@register_model
def yolov8(pretrained=False, pretrained_backbone=True, **kwargs):
    """Create the YOLOv8 layout-analysis model.

    Args:
        pretrained: When true, load the checkpoint registered under
            ``default_cfgs["yolov8"]`` into the model via ``load_pretrained``.
        pretrained_backbone: Accepted for signature compatibility; this
            function does not read it.
        **kwargs: Accepted for signature compatibility; this function does
            not read them.

    Returns:
        The constructed ``LayoutNet`` instance.
    """
    # Values that appear in both the backbone and the head configuration are
    # named once here so the two sections stay identical.
    num_classes = 5
    reg_max = 16
    strides = [8, 16, 32, 64]
    sync_bn = False

    model_config = {
        "backbone": {
            "name": "yolov8_backbone",
            "depth_multiple": 0.33,
            "width_multiple": 0.25,
            "max_channels": 1024,
            "nc": num_classes,
            # Each section gets its own list so neither can mutate the other's.
            "stride": list(strides),
            "reg_max": reg_max,
            "sync_bn": sync_bn,
            "out_channels": [64, 128, 192, 256],
        },
        "neck": {
            "name": "YOLOv8Neck",
            "index": [20, 23, 26, 29],
        },
        "head": {
            "name": "YOLOv8Head",
            "nc": num_classes,
            "reg_max": reg_max,
            "stride": list(strides),
            "sync_bn": sync_bn,
        },
    }
    model = LayoutNet(model_config)

    # Load pretrained weights only on request.
    if pretrained:
        load_pretrained(model, default_cfgs["yolov8"])

    return model
53 changes: 53 additions & 0 deletions tools/infer/text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,60 @@ web_cvpr.png [{"transcription": "canada", "points": [[430, 148], [540, 148], [54
**Notes:**
1. For more argument illustrations and usage, please run `python tools/infer/text/predict_system.py -h` or view `tools/infer/text/config.py`

## Layout Analysis

To run layout analysis on an input image or a directory containing multiple images, please execute
```shell
python tools/infer/text/predict_layout.py --image_dir {path_to_img or dir_to_imgs} --layout_algorithm YOLOv8 --visualize_output True
```
After running, the inference results will be saved in `{args.draw_img_save_dir}/det_results.txt`, where `--draw_img_save_dir` is the directory for saving results and is set to `./inference_results` by default. Here are some example results.

Example 1:
<p align="center">
<img src="../../../configs/layout/yolov8/images/result.png" width=480>
</p>
<p align="center">
<em> Visualization of layout analysis result on PMC4958442_00003.jpg</em>
</p>

The saved layout_result.txt file is as follows:
```
{"image_id": 0, "category_id": 1, "bbox": [308.649, 559.189, 240.211, 81.412], "score": 0.98431}
{"image_id": 0, "category_id": 1, "bbox": [50.435, 673.018, 240.232, 70.262], "score": 0.98414}
{"image_id": 0, "category_id": 3, "bbox": [322.805, 348.831, 225.949, 203.302], "score": 0.98019}
{"image_id": 0, "category_id": 1, "bbox": [308.658, 638.657, 240.31, 70.583], "score": 0.97986}
{"image_id": 0, "category_id": 1, "bbox": [50.616, 604.736, 240.044, 70.086], "score": 0.9797}
{"image_id": 0, "category_id": 1, "bbox": [50.409, 423.237, 240.132, 183.652], "score": 0.97805}
{"image_id": 0, "category_id": 1, "bbox": [308.66, 293.918, 240.181, 47.497], "score": 0.97471}
{"image_id": 0, "category_id": 1, "bbox": [308.64, 707.13, 240.271, 36.028], "score": 0.97427}
{"image_id": 0, "category_id": 1, "bbox": [308.697, 230.568, 240.062, 43.545], "score": 0.96921}
{"image_id": 0, "category_id": 4, "bbox": [51.787, 100.444, 240.267, 273.653], "score": 0.96839}
{"image_id": 0, "category_id": 5, "bbox": [308.637, 74.439, 237.878, 149.174], "score": 0.96707}
{"image_id": 0, "category_id": 1, "bbox": [50.615, 70.667, 240.068, 22.0], "score": 0.94156}
{"image_id": 0, "category_id": 2, "bbox": [50.549, 403.5, 67.392, 12.85], "score": 0.92577}
{"image_id": 0, "category_id": 1, "bbox": [51.384, 374.84, 171.939, 10.736], "score": 0.76692}
```
In this file, `image_id` is the image ID, `bbox` is the detected bounding box `[x-coordinate of the top-left corner, y-coordinate of the top-left corner, width, height]`, `score` is the detection confidence, and `category_id` has the following meanings:
- `1: text`
- `2: title`
- `3: list`
- `4: table`
- `5: figure`

**Notes:**
- For more argument illustrations and usage, please run `python tools/infer/text/predict_layout.py -h` or view `tools/infer/text/config.py`

### Supported Detection Algorithms and Networks

<center>

| **Algorithm Name** | **Network Name** | **Language** |
| :------: | :------: | :------: |
|YOLOv8 | yolov8 |English|

</center>

The algorithm-network mapping is defined in `tools/infer/text/predict_layout.py`.

### Evaluation of the Inference Results

Expand Down
55 changes: 55 additions & 0 deletions tools/infer/text/README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,61 @@ HDL Cholesterol (mg/dL),42 ± 11.1,46 ± 11.4

所有CLI参数定义都可以通过`python tools/infer/text/predict_system.py -h`或`tools/infer/text/config.py`查看。

## 版面分析

要对输入图像或包含多个图像的目录运行版面分析,请执行
```shell
python tools/infer/text/predict_layout.py --image_dir {path_to_img or dir_to_imgs} --layout_algorithm YOLOv8 --visualize_output True
```
运行后,推理结果保存在`{args.draw_img_save_dir}/det_results.txt`中,其中`--draw_img_save_dir`是保存结果的目录,默认为`./inference_results`。以下是一些示例结果。

示例1:
<p align="center">
<img src="../../../configs/layout/yolov8/images/result.png" width=480>
</p>
<p align="center">
<em> PMC4958442_00003.jpg的可视化结果</em>
</p>

其中保存的layout_result.txt文件如下
```
{"image_id": 0, "category_id": 1, "bbox": [308.649, 559.189, 240.211, 81.412], "score": 0.98431}
{"image_id": 0, "category_id": 1, "bbox": [50.435, 673.018, 240.232, 70.262], "score": 0.98414}
{"image_id": 0, "category_id": 3, "bbox": [322.805, 348.831, 225.949, 203.302], "score": 0.98019}
{"image_id": 0, "category_id": 1, "bbox": [308.658, 638.657, 240.31, 70.583], "score": 0.97986}
{"image_id": 0, "category_id": 1, "bbox": [50.616, 604.736, 240.044, 70.086], "score": 0.9797}
{"image_id": 0, "category_id": 1, "bbox": [50.409, 423.237, 240.132, 183.652], "score": 0.97805}
{"image_id": 0, "category_id": 1, "bbox": [308.66, 293.918, 240.181, 47.497], "score": 0.97471}
{"image_id": 0, "category_id": 1, "bbox": [308.64, 707.13, 240.271, 36.028], "score": 0.97427}
{"image_id": 0, "category_id": 1, "bbox": [308.697, 230.568, 240.062, 43.545], "score": 0.96921}
{"image_id": 0, "category_id": 4, "bbox": [51.787, 100.444, 240.267, 273.653], "score": 0.96839}
{"image_id": 0, "category_id": 5, "bbox": [308.637, 74.439, 237.878, 149.174], "score": 0.96707}
{"image_id": 0, "category_id": 1, "bbox": [50.615, 70.667, 240.068, 22.0], "score": 0.94156}
{"image_id": 0, "category_id": 2, "bbox": [50.549, 403.5, 67.392, 12.85], "score": 0.92577}
{"image_id": 0, "category_id": 1, "bbox": [51.384, 374.84, 171.939, 10.736], "score": 0.76692}
```
其中,`image_id`为图像ID,`bbox`为检测出的边界框`[左上角的x坐标,左上角的y坐标,宽度,高度]`, `score`是检测的置信度,`category_id`的含义如下:
- `1: text`
- `2: title`
- `3: list`
- `4: table`
- `5: figure`

**注意事项:**
- 有关更多参数说明和用法,请运行`python tools/infer/text/predict_layout.py -h`或查看`tools/infer/text/config.py`

### 支持的检测算法和网络

<center>

|**算法名称**|**网络名称**|**语言**|
| :------: | :------: | :------: |
|YOLOv8 | yolov8 |英语|

</center>

算法与网络的映射关系在`tools/infer/text/predict_layout.py`中定义。

## 开发人员指南-如何添加新的推断模型

### 预处理
Expand Down
18 changes: 18 additions & 0 deletions tools/infer/text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,24 @@ def create_parser():
"--table_max_len", type=int, default=480, help="max length of the input image for table structure recognition."
)

parser.add_argument(
"--layout_algorithm", type=str, default="YOLOv8", choices=["YOLOv8"], help="layout analyzer algorithm"
)

parser.add_argument(
"--layout_model_dir",
type=str,
help="directory containing the layout model checkpoint best.ckpt, or path to a specific checkpoint file.",
) # determine the network weights

parser.add_argument(
"--layout_amp_level",
type=str,
default="O0",
choices=["O0", "O1", "O2", "O3"],
help="Auto Mixed Precision level. This setting only works on GPU and Ascend",
)

return parser


Expand Down
4 changes: 4 additions & 0 deletions tools/infer/text/postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def __init__(self, task="det", algo="DB", rec_char_dict_path=None, **kwargs):
merge_no_span_structure=True,
box_shape="pad",
)
elif task == "layout":
postproc_cfg = dict(name="YOLOv8Postprocess", conf_thres=0.5, iou_thres=0.7, conf_free=True)

postproc_cfg.update(kwargs)
self.task = task
Expand Down Expand Up @@ -154,4 +156,6 @@ def __call__(self, pred, data=None, **kwargs):
return output
elif self.task == "table":
output = self.postprocess(pred, labels=kwargs.get("labels"))
elif self.task == "layout":
output = self.postprocess(pred, img_shape=kwargs.get("img_shape"), meta_info=kwargs.get("meta_info"))
return output
Loading

0 comments on commit efb6c66

Please sign in to comment.