forked from mindspore-lab/mindocr
fix PP-OCRv3 DBNet and SVTR param maps (mindspore-lab#632)
1 parent f50b4f3, commit f7e7f2d
Showing 24 changed files with 9,251 additions and 525 deletions.
Submodule IDCardCamera added at 52e863
Submodule IdCardOCR added at 2b54c8
907 changes: 606 additions & 301 deletions in configs/det/dbnet/db_mobilenetv3_ppocrv3_param_map.json (large diff not rendered by default)
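The param-map JSON files in this commit pair each PyTorch parameter name with its MindSpore counterpart; the conversion script further below (conversion_utils.py) simply looks names up in this mapping. A hypothetical fragment of such a map, with invented layer names, might look like the following sketch:

# Hypothetical param-map entries (layer names invented for illustration only).
# PyTorch BatchNorm weight/bias typically map to gamma/beta in MindSpore.
param_name_map = {
    "backbone.stage1.0.conv.weight": "backbone.stage1.0.conv.weight",
    "backbone.stage1.0.bn.weight": "backbone.stage1.0.bn.gamma",
    "backbone.stage1.0.bn.bias": "backbone.stage1.0.bn.beta",
    "backbone.stage1.0.bn.running_mean": "backbone.stage1.0.bn.moving_mean",
    "backbone.stage1.0.bn.running_var": "backbone.stage1.0.bn.moving_variance",
}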
@@ -0,0 +1,126 @@
system:
  mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore
  distribute: False
  amp_level: 'O0'
  seed: 42
  # log_interval: 10
  val_while_train: True
  drop_overflow_update: False

model:
  type: kie
  transform: null
  backbone:
    name: layoutxlm_for_ser
    pretrained: True
    checkpoints:
    num_classes: &num_classes 7

postprocess:
  name: VQASerTokenLayoutLMPostProcess
  class_path: &class_path train_data/xfund/class_list_xfun.txt

metric:
  name: VQASerTokenMetric
  main_indicator: hmean

loss:
  name: VQASerTokenLayoutLMLoss
  num_classes: *num_classes
  key: "backbone_out"

scheduler:
  scheduler: polynomial_decay
  lr: 0.00005
  min_lr: 0.0000002
  num_epochs: 200
  warmup_epochs: 2

optimizer:
  opt: adam
  filter_bias_and_bn: False
  weight_decay: 0.0005

train:
  ckpt_save_dir: './tmp_kie'
  dataset_sink_mode: False
  dataset:
    type: KieDataset
    dataset_root: train_data
    data_dir: xfund/zh_train/image
    label_file: xfund/zh_train/train.json
    sample_ratio: 1.0
    transform_pipeline:
      - DecodeImage:
          img_mode: RGB
          to_float32: False
      - VQATokenLabelEncode:
          contains_re: False
          algorithm: &algorithm LayoutXLM
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - LayoutResize:
          size: [ 224, 224 ]
      - NormalizeImage:
          bgr_to_rgb: False
          is_hwc: True
          mean: imagenet
          std: imagenet
      - ToCHWImage:
    # the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize
    output_columns: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels' ]
    net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
    label_column_index: [ 2, 5 ] # input indices marked as label

  loader:
    shuffle: True
    batch_size: 8
    drop_remainder: True
    num_workers: 8

eval:
  ckpt_load_path: './tmp_kie/best.ckpt'
  dataset_sink_mode: False
  dataset:
    type: KieDataset
    dataset_root: train_data
    data_dir: xfund/zh_val/image
    label_file: xfund/zh_val/val.json
    sample_ratio: 1.0
    shuffle: False
    transform_pipeline:
      - DecodeImage:
          img_mode: RGB
          to_float32: False
      - VQATokenLabelEncode:
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - LayoutResize:
          size: [ 224, 224 ]
      - NormalizeImage:
          bgr_to_rgb: False
          is_hwc: True
          mean: imagenet
          std: imagenet
      - ToCHWImage:
    # the order of the dataloader list, matching the network input and the labels for evaluation
    output_columns: [ 'input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels' ]
    net_input_column_index: [ 0, 1, 2, 3, 4 ] # input indices for network forward func in output_columns
    label_column_index: [ 2, 5 ] # input indices marked as label
    # num_keys_of_labels: 2 # num labels

  loader:
    shuffle: False
    batch_size: 1
    drop_remainder: False
    num_workers: 1
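For reference, a minimal sketch (not code from this commit) of how the output_columns, net_input_column_index and label_column_index settings above are typically consumed: the dataloader yields the columns in the listed order, and the two index lists select which of them feed the network forward function and which feed the loss.

# Illustration only: splitting one dataloader batch according to the config above.
output_columns = ['input_ids', 'bbox', 'attention_mask', 'token_type_ids', 'image', 'labels']
net_input_column_index = [0, 1, 2, 3, 4]  # columns passed to the network forward func
label_column_index = [2, 5]               # columns passed to the loss (attention_mask, labels)

def split_batch(batch):
    # batch: sequence of tensors ordered as in output_columns
    net_inputs = [batch[i] for i in net_input_column_index]
    loss_labels = [batch[i] for i in label_column_index]
    return net_inputs, loss_labels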
@@ -0,0 +1,44 @@
import mindspore as ms
from mindspore import nn, ops

__all__ = ["VQASerTokenLayoutLMLoss", "LossFromOutput"]


class VQASerTokenLayoutLMLoss(nn.LossBase):
    """Token-level cross-entropy loss for SER (semantic entity recognition)."""

    def __init__(self, num_classes, key=None):
        super().__init__()
        self.loss_class = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="none")
        self.num_classes = num_classes
        self.key = key

    def construct(self, predicts, attention_mask, labels):
        if isinstance(predicts, dict) and self.key is not None:
            predicts = predicts[self.key]
        if attention_mask is not None:
            # per-token loss, masked so that only attended (non-padding) tokens contribute
            loss = self.loss_class(predicts.reshape((-1, self.num_classes)).astype(ms.float32),
                                   labels.reshape((-1,)).astype(ms.int32))
            attention_mask = attention_mask.reshape((-1,))
            loss = ops.mul(loss, attention_mask)
            loss = loss[loss > 0]
        else:
            loss = self.loss_class(predicts.reshape((-1, self.num_classes)).astype(ms.float32),
                                   labels.reshape((-1,)).astype(ms.int32))
        return ops.reduce_mean(loss)


class LossFromOutput(nn.LossBase):
    """Pass-through loss: picks an already-computed loss out of the network output."""

    def __init__(self, key="loss", reduction="none"):
        super().__init__()
        self.key = key
        self.reduction = reduction

    def construct(self, predicts, batch):
        loss = predicts
        if self.key is not None and isinstance(predicts, dict):
            loss = loss[self.key]
        if self.reduction == "mean":
            loss = ops.mean(loss)
        elif self.reduction == "sum":
            loss = ops.sum(loss)
        return loss
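A minimal sketch of calling the loss above with random inputs (PyNative mode assumed; shapes follow the config's max_seq_len of 512 and 7 classes):

import numpy as np
import mindspore as ms

# Toy check of VQASerTokenLayoutLMLoss: batch of 2, sequence length 512, 7 classes.
loss_fn = VQASerTokenLayoutLMLoss(num_classes=7)
logits = ms.Tensor(np.random.randn(2, 512, 7), ms.float32)
attention_mask = ms.Tensor(np.ones((2, 512)), ms.float32)  # 1 = real token, 0 = padding
labels = ms.Tensor(np.zeros((2, 512)), ms.int32)
print(loss_fn(logits, attention_mask, labels))  # scalar mean loss over attended tokens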
@@ -0,0 +1,6 @@
from .configuration import LayoutXLMPretrainedConfig
from .layoutxlm import LayoutXLMModel, layoutxlm_for_re, layoutxlm_for_ser
from .tokenizer import LayoutXLMTokenizer

__all__ = ["LayoutXLMModel", "LayoutXLMPretrainedConfig",
           "LayoutXLMTokenizer", "layoutxlm_for_ser", "layoutxlm_for_re"]
@@ -0,0 +1,35 @@
from dataclasses import dataclass


@dataclass
class LayoutXLMPretrainedConfig:
    attention_probs_dropout_prob = 0.1
    bos_token_id = 0
    coordinate_size = 128
    eos_token_id = 2
    fast_qkv = False
    gradient_checkpointing = False
    has_relative_attention_bias = False
    has_spatial_attention_bias = False
    has_visual_segment_embedding = True
    hidden_act = "gelu"
    hidden_dropout_prob = 0.1
    hidden_size = 768
    image_feature_pool_shape = [7, 7, 256]
    initializer_range = 0.02
    intermediate_size = 3072
    layer_norm_eps = 1e-05
    max_2d_position_embeddings = 1024
    max_position_embeddings = 514
    max_rel_2d_pos = 256
    max_rel_pos = 128
    model_type = "layoutxlm"
    num_attention_heads = 12
    num_hidden_layers = 12
    output_past = True
    pad_token_id = 1
    shape_size = 128
    rel_2d_pos_bins = 64
    rel_pos_bins = 32
    type_vocab_size = 1
    vocab_size = 250002
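Note that the attributes above carry no type annotations, so @dataclass registers no init fields; they behave as class-level defaults and the generated __init__ takes no arguments. A minimal usage sketch:

# Class-level defaults: read them directly, override per instance if needed.
config = LayoutXLMPretrainedConfig()
print(config.hidden_size, config.num_hidden_layers)  # 768 12
config.num_hidden_layers = 6  # hypothetical override for a smaller model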
42 changes: 42 additions & 0 deletions in mindocr/models/backbones/layout_xlm_ms/conversion_utils.py
@@ -0,0 +1,42 @@
# convert the PyTorch LayoutXLM model to a MindSpore checkpoint
import json
import os

from transformers import LayoutLMv2Model

import mindspore as ms


def set_proxy():
    proxy_addr = "http://127.0.0.1:7078"  # your proxy addr
    os.environ['http_proxy'] = proxy_addr
    os.environ['https_proxy'] = proxy_addr


def unset_proxy():
    os.environ.pop("http_proxy")
    os.environ.pop("https_proxy")


# load the param-map json (PyTorch parameter name -> MindSpore parameter name)
with open("param_map.json", "r") as json_file:
    param_name_map = json.load(json_file)

# set a proxy if you need one
set_proxy()

# load the PyTorch model
model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
params_dict = model.state_dict()

# conversion: rename each parameter and wrap its data as a MindSpore tensor
ms_params = []
for name, value in params_dict.items():
    each_param = dict()
    each_param["name"] = param_name_map[name]
    each_param["data"] = ms.Tensor(value.numpy())
    ms_params.append(each_param)

ms.save_checkpoint(ms_params, "layoutxlm-base.ckpt")

unset_proxy()
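As a quick sanity check, the converted checkpoint can be loaded back into the MindSpore backbone; a minimal sketch, assuming the layout_xlm_ms package __init__ shown earlier exports these names and that LayoutXLMModel accepts a LayoutXLMPretrainedConfig (check layoutxlm.py for the real signature):

import mindspore as ms
from mindocr.models.backbones.layout_xlm_ms import LayoutXLMModel, LayoutXLMPretrainedConfig

# Assumption: LayoutXLMModel is constructed from a LayoutXLMPretrainedConfig.
net = LayoutXLMModel(LayoutXLMPretrainedConfig())
param_dict = ms.load_checkpoint("layoutxlm-base.ckpt")
ms.load_param_into_net(net, param_dict)  # parameters whose names do not match are reported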