Skip to content

Commit

Permalink
fix PP-OCRv3 DBNet and SVTR param maps (mindspore-lab#632)
Browse files Browse the repository at this point in the history
  • Loading branch information
tonytonglt authored and panshaowu committed Dec 14, 2023
1 parent f50b4f3 commit f7e7f2d
Show file tree
Hide file tree
Showing 24 changed files with 9,251 additions and 525 deletions.
1 change: 1 addition & 0 deletions IDCardCamera
Submodule IDCardCamera added at 52e863
1 change: 1 addition & 0 deletions IdCardOCR
Submodule IdCardOCR added at 2b54c8
907 changes: 606 additions & 301 deletions configs/det/dbnet/db_mobilenetv3_ppocrv3_param_map.json

Large diffs are not rendered by default.

126 changes: 126 additions & 0 deletions configs/kie/ser_layoutxlm_xfund_zh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
system:
mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
distribute: False
amp_level: 'O0'
seed: 42
# log_interval: 10
val_while_train: True
drop_overflow_update: False

model:
type: kie
transform: null
backbone:
name: layoutxlm_for_ser
pretrained: True
checkpoints:
num_classes: &num_classes 7

postprocess:
name: VQASerTokenLayoutLMPostProcess
class_path: &class_path train_data/xfund/class_list_xfun.txt

metric:
name: VQASerTokenMetric
main_indicator: hmean

loss:
name: VQASerTokenLayoutLMLoss
num_classes: *num_classes
key: "backbone_out"

scheduler:
scheduler: polynomial_decay
lr: 0.00005
min_lr: 0.0000002
num_epochs: 200
warmup_epochs: 2

optimizer:
opt: adam
filter_bias_and_bn: False
weight_decay: 0.0005

train:
ckpt_save_dir: './tmp_kie'
dataset_sink_mode: False
dataset:
type: KieDataset
dataset_root: train_data
data_dir: xfund/zh_train/image
label_file: xfund/zh_train/train.json
sample_ratio: 1.0
transform_pipeline:
- DecodeImage:
img_mode: RGB
to_float32: False
- VQATokenLabelEncode:
contains_re: False
algorithm: &algorithm LayoutXLM
class_path: *class_path
- VQATokenPad:
max_seq_len: &max_seq_len 512
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- LayoutResize:
size: [ 224, 224 ]
- NormalizeImage:
bgr_to_rgb: False
is_hwc: True
mean: imagenet
std: imagenet
- ToCHWImage:
# Column order produced by the dataloader. It must cover the network inputs and the labels passed to the loss function; extra columns may be appended for debugging/visualization.
output_columns: [ 'input_ids', 'bbox','attention_mask','token_type_ids', 'image', 'labels' ]
net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
label_column_index: [ 2, 5 ] # input indices marked as label

loader:
shuffle: True
batch_size: 8
drop_remainder: True
num_workers: 8

eval:
ckpt_load_path: './tmp_kie/best.ckpt'
dataset_sink_mode: False
dataset:
type: KieDataset
dataset_root: train_data
data_dir: xfund/zh_val/image
label_file: xfund/zh_val/val.json
sample_ratio: 1.0
shuffle: False
transform_pipeline:
- DecodeImage:
img_mode: RGB
to_float32: False
- VQATokenLabelEncode:
contains_re: False
algorithm: *algorithm
class_path: *class_path
- VQATokenPad:
max_seq_len: *max_seq_len
return_attention_mask: True
- VQASerTokenChunk:
max_seq_len: *max_seq_len
- LayoutResize:
size: [ 224, 224 ]
- NormalizeImage:
bgr_to_rgb: False
is_hwc: True
mean: imagenet
std: imagenet
- ToCHWImage:
# Column order produced by the dataloader. It must cover the network inputs and the labels used for evaluation.
output_columns: [ 'input_ids', 'bbox', 'attention_mask','token_type_ids','image', 'labels' ]
net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
label_column_index: [ 2, 5 ] # input indices marked as label
# num_keys_of_labels: 2 # num labels

loader:
shuffle: False
batch_size: 1
drop_remainder: False
num_workers: 1
673 changes: 450 additions & 223 deletions configs/rec/svtr/svtr_ppocrv3_ch_param_map.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions mindocr/losses/layout_loss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import mindspore as ms
from mindspore import nn, ops

__all__ = ["VQASerTokenLayoutLMLoss", "LossFromOutput"]


class VQASerTokenLayoutLMLoss(nn.LossBase):
    """Sparse softmax cross-entropy over flattened predictions, optionally masked.

    Args:
        num_classes: size of the last axis the predictions are reshaped to.
        key: when ``predicts`` is a dict and ``key`` is not None, the entry
            ``predicts[key]`` is used as the prediction tensor.
    """

    def __init__(self, num_classes, key=None):
        super().__init__()
        # reduction="none" keeps one loss value per element so it can be masked below.
        self.loss_class = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="none")
        self.num_classes = num_classes
        self.key = key

    def construct(self, predicts, attention_mask, labels):
        """Return the mean loss over the retained elements.

        Args:
            predicts: prediction tensor, or a dict holding it under ``self.key``.
            attention_mask: multiplied element-wise with the flattened loss;
                pass None to average over every element.
            labels: class indices, flattened and cast to int32.
        """
        if isinstance(predicts, dict) and self.key is not None:
            predicts = predicts[self.key]

        # Flatten to (N, num_classes) logits and (N,) integer targets.
        logits = predicts.reshape((-1, self.num_classes)).astype(ms.float32)
        targets = labels.reshape((-1,)).astype(ms.int32)
        loss = self.loss_class(logits, targets)

        if attention_mask is not None:
            flat_mask = attention_mask.reshape((-1,))
            loss = ops.mul(loss, flat_mask)
            # Only strictly positive entries are kept, so masked-out elements
            # (and any element whose loss is exactly 0) do not enter the mean.
            # NOTE(review): if nothing survives the filter the mean is taken
            # over an empty tensor - confirm callers never hit that case.
            loss = loss[loss > 0]
        return ops.reduce_mean(loss)


class LossFromOutput(nn.LossBase):
    """Use a loss value that the network has already computed.

    Args:
        key: when the network output is a dict and ``key`` is not None, the
            loss is read from ``predicts[key]``; otherwise ``predicts`` itself
            is treated as the loss.
        reduction: "mean" or "sum" to reduce the loss; any other value
            returns it unchanged.
    """

    def __init__(self, key="loss", reduction="none"):
        super().__init__()
        self.key = key
        self.reduction = reduction

    def construct(self, predicts, batch):
        """Select the loss from ``predicts`` and apply the configured reduction.

        ``batch`` is accepted for signature compatibility and is not used.
        """
        if self.key is not None and isinstance(predicts, dict):
            loss = predicts[self.key]
        else:
            loss = predicts

        if self.reduction == "mean":
            loss = ops.mean(loss)
        elif self.reduction == "sum":
            loss = ops.sum(loss)
        return loss
6 changes: 6 additions & 0 deletions mindocr/models/backbones/layout_xlm_ms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .configuration import LayoutXLMPretrainedConfig
from .layoutxlm import LayoutXLMModel, layoutxlm_for_re, layoutxlm_for_ser
from .tokenizer import LayoutXLMTokenizer

__all__ = ["LayoutXLMModel", "LayoutXLMPretrainedConfig",
"LayoutXLMTokenizer", "layoutxlm_for_ser", "layoutxlm_for_re"]
35 changes: 35 additions & 0 deletions mindocr/models/backbones/layout_xlm_ms/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from dataclasses import dataclass


@dataclass
class LayoutXLMPretrainedConfig:
    """Default hyper-parameters for the LayoutXLM backbone.

    The attributes below carry no type annotations, so ``@dataclass`` does not
    turn them into fields: they are plain class attributes shared by every
    instance, and the generated ``__init__`` takes no arguments.
    """

    # Identification
    model_type = "layoutxlm"

    # Vocabulary and special token ids
    vocab_size = 250002
    type_vocab_size = 1
    bos_token_id = 0
    pad_token_id = 1
    eos_token_id = 2

    # Encoder sizes
    hidden_size = 768
    intermediate_size = 3072
    num_hidden_layers = 12
    num_attention_heads = 12
    hidden_act = "gelu"

    # Regularisation, initialisation and normalisation
    hidden_dropout_prob = 0.1
    attention_probs_dropout_prob = 0.1
    initializer_range = 0.02
    layer_norm_eps = 1e-05

    # 1D / 2D position settings
    max_position_embeddings = 514
    max_2d_position_embeddings = 1024
    coordinate_size = 128
    shape_size = 128
    max_rel_pos = 128
    rel_pos_bins = 32
    max_rel_2d_pos = 256
    rel_2d_pos_bins = 64

    # Visual branch
    # NOTE: this list is a single object shared by the class and all instances.
    image_feature_pool_shape = [7, 7, 256]

    # Feature switches
    fast_qkv = False
    gradient_checkpointing = False
    has_relative_attention_bias = False
    has_spatial_attention_bias = False
    has_visual_segment_embedding = True
    output_past = True
42 changes: 42 additions & 0 deletions mindocr/models/backbones/layout_xlm_ms/conversion_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Convert the pretrained PyTorch LayoutXLM weights into a MindSpore checkpoint.
import json
import os

from transformers import LayoutLMv2Model

import mindspore as ms


def set_proxy(proxy_addr="http://127.0.0.1:7078"):
    """Point the ``http_proxy`` and ``https_proxy`` environment variables at a proxy.

    Args:
        proxy_addr: proxy URL to use. The default is a local placeholder;
            pass your own address instead of editing this function.
    """
    for var_name in ("http_proxy", "https_proxy"):
        os.environ[var_name] = proxy_addr


def unset_proxy():
    """Remove the ``http_proxy`` and ``https_proxy`` environment variables.

    Safe to call when either variable is already absent: a default is passed
    to ``pop`` so no ``KeyError`` is raised.
    """
    for var_name in ("http_proxy", "https_proxy"):
        os.environ.pop(var_name, None)


# Load the mapping from PyTorch parameter names to MindSpore parameter names.
with open("param_map.json", "r") as json_file:
    param_name_map = json.load(json_file)

# Use a proxy if you need one; remove this call (and the cleanup below) otherwise.
set_proxy()
try:
    # Load the pretrained PyTorch model and grab its parameters.
    model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
    params_dict = model.state_dict()

    # Rename every parameter through the map and wrap its data as a MindSpore tensor.
    # A PyTorch name missing from param_map.json raises KeyError here.
    ms_params = [
        {"name": param_name_map[name], "data": ms.Tensor(value.numpy())}
        for name, value in params_dict.items()
    ]

    ms.save_checkpoint(ms_params, "layoutxlm-base.ckpt")
finally:
    # Always clear the proxy variables, even if download, conversion or save failed.
    unset_proxy()
Loading

0 comments on commit f7e7f2d

Please sign in to comment.