write_synthtext_pyarrow.py
import os
import random

import pandas as pd
import pyarrow as pa
import scipy.io as sio
from tqdm import tqdm

dataset_root = './data'

# SynthText ground truth: parallel arrays of image names and text annotations.
gts = sio.loadmat(f"{dataset_root}/SynthText/gt.mat")
img_names = gts['imnames'][0]

bs = []
for i in tqdm(range(len(img_names))):
    if i > 1000:
        break  # cap at the first ~1000 images; remove to process the full dataset
    img_name = img_names[i][0]

    # Each entry of gts['txt'] holds newline- and space-separated words;
    # flatten them into a single space-joined caption string.
    word_ann = []
    txt_ann = gts['txt'][0][i]
    for j in range(len(txt_ann)):
        bbox_ann = txt_ann[j].split('\n')
        for k in range(len(bbox_ann)):
            word_ann.extend(bbox_ann[k].strip().split(' '))
    word_ann = ' '.join(word_ann)

    # Randomly hold out ~4% of images for validation.
    if random.uniform(0, 1) > 0.04:
        split = 'train'
    else:
        split = 'val'

    # Store the raw image bytes alongside the caption, image id, and split.
    with open(f"{dataset_root}/SynthText/{img_name}", "rb") as fp:
        binary = fp.read()
    bs.append([binary, [word_ann], img_name, split])

# Write one Arrow file per split.
for split in ["train", "val"]:
    batches = [b for b in bs if b[-1] == split]
    dataframe = pd.DataFrame(
        batches, columns=["image", "caption", "image_id", "split"],
    )
    table = pa.Table.from_pandas(dataframe)
    os.makedirs(dataset_root, exist_ok=True)
    with pa.OSFile(
        f"{dataset_root}/synthtext_{split}.arrow", "wb"
    ) as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
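
# For reference, a minimal sketch (not part of the original script) of how
# one of the Arrow files written above can be read back, assuming the same
# paths and schema as the writer:
#
# import io
# import pyarrow as pa
#
# with pa.OSFile("./data/synthtext_train.arrow", "rb") as source:
#     table = pa.ipc.open_file(source).read_all()
#
# df = table.to_pandas()
# print(df[["image_id", "caption", "split"]].head())
#
# The "image" column holds raw image bytes; it can be decoded with PIL, e.g.
# Image.open(io.BytesIO(df["image"][0])).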