fix PP-OCRv3 DBNet and SVTR param maps (mindspore-lab#632)

panshaowu · Dec 14, 2023 · f7e7f2d · f7e7f2d
1 parent f50b4f3
commit f7e7f2d
Show file tree

Hide file tree

Showing 24 changed files with 9,251 additions and 525 deletions.
diff --git a/IDCardCamera b/IDCardCamera
diff --git a/IdCardOCR b/IdCardOCR
diff --git a/configs/det/dbnet/db_mobilenetv3_ppocrv3_param_map.json b/configs/det/dbnet/db_mobilenetv3_ppocrv3_param_map.json
diff --git a/configs/kie/ser_layoutxlm_xfund_zh.yaml b/configs/kie/ser_layoutxlm_xfund_zh.yaml
@@ -0,0 +1,126 @@
+system:
+  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
+  distribute: False
+  amp_level: 'O0'
+  seed: 42
+  # log_interval: 10
+  val_while_train: True
+  drop_overflow_update: False
+
+model:
+  type: kie
+  transform: null
+  backbone:
+    name: layoutxlm_for_ser
+    pretrained: True
+    checkpoints:
+    num_classes: &num_classes 7
+
+postprocess:
+  name: VQASerTokenLayoutLMPostProcess
+  class_path: &class_path train_data/xfund/class_list_xfun.txt
+
+metric:
+  name: VQASerTokenMetric
+  main_indicator: hmean
+
+loss:
+  name: VQASerTokenLayoutLMLoss
+  num_classes: *num_classes
+  key: "backbone_out"
+
+scheduler:
+  scheduler: polynomial_decay
+  lr: 0.00005
+  min_lr: 0.0000002
+  num_epochs: 200
+  warmup_epochs: 2
+
+optimizer:
+  opt: adam
+  filter_bias_and_bn: False
+  weight_decay: 0.0005
+
+train:
+  ckpt_save_dir: './tmp_kie'
+  dataset_sink_mode: False
+  dataset:
+    type: KieDataset
+    dataset_root: train_data
+    data_dir: xfund/zh_train/image
+    label_file: xfund/zh_train/train.json
+    sample_ratio: 1.0
+    transform_pipeline:
+      - DecodeImage:
+          img_mode: RGB
+          to_float32: False
+      - VQATokenLabelEncode:
+          contains_re: False
+          algorithm: &algorithm LayoutXLM
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: &max_seq_len 512
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - LayoutResize:
+          size: [ 224, 224 ]
+      - NormalizeImage:
+          bgr_to_rgb: False
+          is_hwc: True
+          mean: imagenet
+          std: imagenet
+      - ToCHWImage:
+    # Order of the columns produced by the dataloader. It must match the network inputs and the labels passed to the loss function; extra columns may be added for debugging/visualization.
+    output_columns: [ 'input_ids', 'bbox','attention_mask','token_type_ids', 'image', 'labels' ]
+    net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
+    label_column_index: [ 2, 5 ] # input indices marked as label
+
+  loader:
+    shuffle: True
+    batch_size: 8
+    drop_remainder: True
+    num_workers: 8
+
+eval:
+  ckpt_load_path: './tmp_kie/best.ckpt'
+  dataset_sink_mode: False
+  dataset:
+    type: KieDataset
+    dataset_root: train_data
+    data_dir: xfund/zh_val/image
+    label_file: xfund/zh_val/val.json
+    sample_ratio: 1.0
+    shuffle: False
+    transform_pipeline:
+      - DecodeImage:
+          img_mode: RGB
+          to_float32: False
+      - VQATokenLabelEncode:
+          contains_re: False
+          algorithm: *algorithm
+          class_path: *class_path
+      - VQATokenPad:
+          max_seq_len: *max_seq_len
+          return_attention_mask: True
+      - VQASerTokenChunk:
+          max_seq_len: *max_seq_len
+      - LayoutResize:
+          size: [ 224, 224 ]
+      - NormalizeImage:
+          bgr_to_rgb: False
+          is_hwc: True
+          mean: imagenet
+          std: imagenet
+      - ToCHWImage:
+    # Order of the columns produced by the dataloader. It must match the network inputs and the labels used for evaluation.
+    output_columns: [ 'input_ids', 'bbox', 'attention_mask','token_type_ids','image', 'labels' ]
+    net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
+    label_column_index: [ 2, 5 ] # input indices marked as label
+  #    num_keys_of_labels: 2 # num labels
+
+  loader:
+    shuffle: False
+    batch_size: 1
+    drop_remainder: False
+    num_workers: 1
diff --git a/configs/rec/svtr/svtr_ppocrv3_ch_param_map.json b/configs/rec/svtr/svtr_ppocrv3_ch_param_map.json
diff --git a/mindocr/losses/layout_loss.py b/mindocr/losses/layout_loss.py
@@ -0,0 +1,44 @@
+import mindspore as ms
+from mindspore import nn, ops
+
+__all__ = ["VQASerTokenLayoutLMLoss", "LossFromOutput"]
+
+
+class VQASerTokenLayoutLMLoss(nn.LossBase):
+    def __init__(self, num_classes, key=None):
+        super().__init__()
+        self.loss_class = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="none")
+        self.num_classes = num_classes
+        self.key = key
+
+    def construct(self, predicts, attention_mask, labels):
+        if isinstance(predicts, dict) and self.key is not None:
+            predicts = predicts[self.key]
+        if attention_mask is not None:
+
+            loss = self.loss_class(predicts.reshape((-1, self.num_classes)).astype(ms.float32),
+                                   labels.reshape((-1,)).astype(ms.int32))
+            attention_mask = attention_mask.reshape((-1,))
+            loss = ops.mul(loss, attention_mask)
+            loss = loss[loss > 0]
+        else:
+            loss = self.loss_class(predicts.reshape((-1, self.num_classes)).astype(ms.float32),
+                                   labels.reshape((-1,)).astype(ms.int32))
+        return ops.reduce_mean(loss)
+
+
+class LossFromOutput(nn.LossBase):
+    def __init__(self, key="loss", reduction="none"):
+        super().__init__()
+        self.key = key
+        self.reduction = reduction
+
+    def construct(self, predicts, batch):
+        loss = predicts
+        if self.key is not None and isinstance(predicts, dict):
+            loss = loss[self.key]
+        if self.reduction == "mean":
+            loss = ops.mean(loss)
+        elif self.reduction == "sum":
+            loss = ops.sum(loss)
+        return loss
diff --git a/mindocr/models/backbones/layout_xlm_ms/__init__.py b/mindocr/models/backbones/layout_xlm_ms/__init__.py
@@ -0,0 +1,6 @@
+from .configuration import LayoutXLMPretrainedConfig
+from .layoutxlm import LayoutXLMModel, layoutxlm_for_re, layoutxlm_for_ser
+from .tokenizer import LayoutXLMTokenizer
+
+__all__ = ["LayoutXLMModel", "LayoutXLMPretrainedConfig",
+           "LayoutXLMTokenizer", "layoutxlm_for_ser", "layoutxlm_for_re"]
diff --git a/mindocr/models/backbones/layout_xlm_ms/configuration.py b/mindocr/models/backbones/layout_xlm_ms/configuration.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass
+from typing import ClassVar
+
+
+@dataclass
+class LayoutXLMPretrainedConfig:
+    attention_probs_dropout_prob = 0.1
+    bos_token_id = 0
+    coordinate_size = 128
+    eos_token_id = 2
+    fast_qkv = False
+    gradient_checkpointing = False
+    has_relative_attention_bias = False
+    has_spatial_attention_bias = False
+    has_visual_segment_embedding = True
+    hidden_act = "gelu"
+    hidden_dropout_prob = 0.1
+    hidden_size = 768
+    image_feature_pool_shape = [7, 7, 256]
+    initializer_range = 0.02
+    intermediate_size = 3072
+    layer_norm_eps = 1e-05
+    max_2d_position_embeddings = 1024
+    max_position_embeddings = 514
+    max_rel_2d_pos = 256
+    max_rel_pos = 128
+    model_type = "layoutxlm"
+    num_attention_heads = 12
+    num_hidden_layers = 12
+    output_past = True
+    pad_token_id = 1
+    shape_size = 128
+    rel_2d_pos_bins = 64
+    rel_pos_bins = 32
+    type_vocab_size = 1
+    vocab_size = 250002
diff --git a/mindocr/models/backbones/layout_xlm_ms/conversion_utils.py b/mindocr/models/backbones/layout_xlm_ms/conversion_utils.py
@@ -0,0 +1,42 @@
+# convert pytorch model to mindspore
+import json
+import os
+
+from transformers import LayoutLMv2Model
+
+import mindspore as ms
+
+
+def set_proxy():
+    proxy_addr = "http://127.0.0.1:7078"  # your proxy addr
+    os.environ['http_proxy'] = proxy_addr
+    os.environ['https_proxy'] = proxy_addr
+
+
+def unset_proxy():
+    os.environ.pop("http_proxy")
+    os.environ.pop("https_proxy")
+
+
+# load param_map json
+with open("param_map.json", "r") as json_file:
+    param_name_map = json.load(json_file)
+
+# use proxy if you needed
+set_proxy()
+
+# load pytorch model
+model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
+params_dict = model.state_dict()
+
+# conversion
+ms_params = []
+for name, value in params_dict.items():
+    each_param = dict()
+    each_param["name"] = param_name_map[name]
+    each_param["data"] = ms.Tensor(value.numpy())
+    ms_params.append(each_param)
+
+ms.save_checkpoint(ms_params, "layoutxlm-base.ckpt")
+
+unset_proxy()