-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcharacter_extractor.py
327 lines (255 loc) · 11 KB
/
character_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import base64
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import cv2
import numpy as np
import pandas as pd
import pytesseract
# Tell pytesseract which Tesseract executable to run. The path is hard-coded
# and specific to one Windows user profile; adjust it on any other machine.
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\afogarty\AppData\Local\Tesseract-OCR\tesseract.exe"
# white text on black background
def _decode_b64_image(encoded):
    '''
    Decode a single base64 image string into an np array with cv2.imdecode
    '''
    raw = np.frombuffer(base64.b64decode(str(encoded)), np.uint8)
    # IMREAD_UNCHANGED keeps the file's own channel layout (e.g. an alpha channel)
    image = cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)
    if image is None:
        # imdecode signals undecodable data by returning None instead of raising
        raise ValueError('base64 payload could not be decoded as an image')
    return image


def prepare_b64_image(background, foreground):
    '''
    Decode the base64 background / foreground pair into image arrays

    params:
    background: string - base64 image as text
    foreground: string - base64 image as text

    returns:
    (background, foreground) tuple of np arrays as produced by cv2.imdecode

    raises:
    ValueError if either payload is not a decodable image
    '''
    return _decode_b64_image(background), _decode_b64_image(foreground)
def add_transparent_image(background, foreground, x_offset=None, y_offset=None):
    '''
    Alpha-blend the foreground onto the background and return the blended
    overlap region. Neither input array is modified.

    params:
    background: array - 3 channel image as np array
    foreground: array - 4 channel image as np array; the 4th channel is the
                        alpha value in the 0-255 range
    x_offset: int - column of the background where the left edge of the
                    foreground is placed (may be negative); None means 0
    y_offset: int - row of the background where the top edge of the
                    foreground is placed (may be negative); None means 0

    returns:
    float32 array with 3 channels covering only the area where both images
    overlap; it has zero width and/or height when they do not overlap

    raises:
    ValueError if either image has an unexpected number of channels
    '''
    # the declared defaults used to crash in the arithmetic below
    x_offset = 0 if x_offset is None else x_offset
    y_offset = 0 if y_offset is None else y_offset

    bg_h, bg_w, bg_channels = background.shape
    fg_h, fg_w, fg_channels = foreground.shape
    # explicit raises instead of assert, which is stripped under `python -O`
    if bg_channels != 3:
        raise ValueError(f'background image should have exactly 3 channels (RGB). found:{bg_channels}')
    if fg_channels != 4:
        raise ValueError(f'foreground image should have exactly 4 channels (RGBA). found:{fg_channels}')

    # size of the overlap; clamped at 0 so a foreground that lies completely
    # outside the background gives an empty result rather than wrapped slices
    w = max(0, min(fg_w, bg_w, fg_w + x_offset, bg_w - x_offset))
    h = max(0, min(fg_h, bg_h, fg_h + y_offset, bg_h - y_offset))

    # top-left corner of the overlap in each image's own coordinates
    bg_x = max(0, x_offset)
    bg_y = max(0, y_offset)
    fg_x = max(0, -x_offset)
    fg_y = max(0, -y_offset)

    # slices are views; nothing is written to them, so no copies are needed
    fg_region = foreground[fg_y:fg_y + h, fg_x:fg_x + w]
    bg_region = background[bg_y:bg_y + h, bg_x:bg_x + w]

    # separate alpha and color channels from the foreground image
    foreground_colors = fg_region[:, :, :3]
    # 0-255 => 0.0-1.0; the trailing axis lets it broadcast over the colors
    alpha = fg_region[:, :, 3:4] / 255

    # combine the background with the overlay image weighted by alpha
    composite = bg_region * (1 - alpha) + foreground_colors * alpha
    return composite.astype(np.float32)
def captcha_preprocessing(img, params):
    '''
    parameterized processing: grayscale -> median blur -> inverted adaptive
    threshold -> small-contour removal -> morphological opening ->
    gaussian blur -> vertical crop -> resize

    params:
    img: array - image as np array
    params: dict - dictionary of parameterized options; the keys read here are
                   blur, at_window, c, contour_area, morph_shape, e_l, e_r,
                   b_l, b_r, s_x, s_y and resize

    returns:
    the processed single channel image as np array
    '''
    # gray; adaptiveThreshold below needs an 8-bit single channel image
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype('uint8')
    # blur
    blurred = cv2.medianBlur(gray, params['blur'])
    # thresh
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                   cv2.THRESH_BINARY_INV, params['at_window'], params['c'])
    # Find contours and remove small noise; this is an option to just get a single char or so
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        if cv2.contourArea(c) < params['contour_area']:
            # paint the small contour with 0, filled (thickness -1)
            cv2.drawContours(thresh, [c], -1, 0, -1)
    # cv2.MORPH_RECT, cv2.MORPH_CROSS, cv2.MORPH_ELLIPSE
    kernel = cv2.getStructuringElement(params['morph_shape'], (params['e_l'], params['e_r']))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    # another blur. sigmaY has to be given by keyword: in the Python binding
    # the 4th positional argument of GaussianBlur is `dst`, so passing s_y
    # positionally never applied it as the vertical sigma
    blurred2 = cv2.GaussianBlur(opening, (params['b_l'], params['b_r']),
                                sigmaX=params['s_x'], sigmaY=params['s_y'])
    # crop edge noise
    crop = remove_noise(im_array=blurred2, lower=5, upper=95)
    # resize
    return rescale_image(crop, params['resize'])
def remove_noise(im_array, lower, upper):
    '''
    Remove noise from image by cropping areas with few black pixels

    params:
    im_array: array - image as np array
    lower: number - percentile (0-100) of the black-pixel row positions used
                    as the first row that is kept
    upper: number - percentile (0-100) of the black-pixel row positions used
                    as the row where the crop stops (exclusive)

    returns:
    the rows lower-percentile:upper-percentile of im_array, all columns;
    im_array itself when it contains no black pixel at all
    '''
    # row index (y-axis) of every pixel whose value is 0
    rows = np.where(im_array == 0)[0]
    if rows.size == 0:
        # no black pixel to measure; a percentile of nothing is NaN
        return im_array
    # percentiles are floats; truncate them to usable row indices
    top = int(np.percentile(rows, lower))
    bottom = int(np.percentile(rows, upper))
    # crop image
    return im_array[top:bottom, :]
def rescale_image(im_array, percent):
    '''
    Resize an image to `percent` percent of its current width and height

    params:
    im_array: array - image as np array
    percent: number - target size as a percentage of the current size

    returns:
    the resized image, interpolated with cv2.INTER_NEAREST
    '''
    rows, cols = im_array.shape[0], im_array.shape[1]
    # cv2.resize takes the target size as (width, height); fractions are truncated
    target_size = tuple(int(side * percent / 100) for side in (cols, rows))
    return cv2.resize(im_array, target_size, interpolation=cv2.INTER_NEAREST)
def _single_char_hits(ocr_frame, offset, bg, fg):
    '''
    Reduce a pytesseract image_to_data frame to confident single-character rows
    '''
    # filter nan
    hits = ocr_frame.loc[ocr_frame['text'].notna()].copy()
    # handle np.inf situation before going int
    hits = hits.replace(np.inf, 0)
    # pandas may have parsed purely numeric text as float; go int if float
    hits['text'] = hits['text'].map(lambda x: int(x) if isinstance(x, float) else x)
    # set result to strings, remove periods, strip white space
    hits['text'] = hits['text'].astype(str).str.replace('.', '', regex=False).str.strip()
    # get lens
    hits['len'] = hits['text'].str.len()
    # add in offset and the two source images
    hits['offset'] = offset
    hits['bg_name'] = bg
    hits['fg_name'] = fg
    # extract results
    hits = hits[['left', 'top', 'width', 'height', 'conf', 'text', 'offset', 'bg_name', 'fg_name', 'len']]
    # filter to just len 1 characters and just good results
    keep = (hits['len'] == 1) & (hits['conf'] >= 70)
    return hits.loc[keep].reset_index(drop=True)


def ocr_sliding_image(max_offset, bg, fg, params):
    '''
    offset background with foreground and run pytesseract to determine
    individual characters and positions for cropping

    params:
    max_offset: int - x offsets 0 .. max_offset - 1 are tried, one pixel apart
    bg: string - base64 background image as text
    fg: string - base64 foreground image as text
    params: dict - options for captcha_preprocessing plus 'config', the
                   configuration string handed to pytesseract

    returns:
    DataFrame with the single-character detections (conf >= 70) that share the
    highest confidence over all offsets; columns are left, top, width, height,
    conf, text, offset, bg_name, fg_name, len, where bg_name / fg_name hold
    the bg / fg arguments. An empty DataFrame when nothing qualified.
    '''
    # prepare b64 image
    background, foreground = prepare_b64_image(background=bg,
                                               foreground=fg)
    # collected per-offset hits; concatenated once after the loop
    frames = []
    for offset in range(max_offset):
        # run image joining at offset
        out = add_transparent_image(background=background,
                                    foreground=foreground,
                                    x_offset=offset,
                                    y_offset=0)
        try:
            # run pre-processing
            captcha_out = captcha_preprocessing(img=out, params=params)
        except Exception as e:
            # best effort: an offset that cannot be processed is skipped
            print('exception triggered', e)
            continue
        # get conf
        conf_out = pytesseract.image_to_data(captcha_out,
                                             lang='eng',
                                             config=params['config'],
                                             output_type='data.frame')
        hits = _single_char_hits(conf_out, offset, bg, fg)
        if not hits.empty:
            frames.append(hits)
    if not frames:
        return pd.DataFrame()
    storage_df = pd.concat(frames, axis=0)
    # return max result
    return storage_df.loc[storage_df['conf'] == storage_df['conf'].max()].reset_index(drop=True)
def extract_character(background, foreground, x, y, h, w, offset, resize, text, conf,
                      output_dir=r"C:\Users\afogarty\Desktop\extracted_chars"):
    '''
    Extract individual characters given pytesseract's findings and write each
    one to disk as a grayscale png named "<text>_<conf>.png"

    params:
    background: string - base64 image as text
    foreground: string - base64 image as text
    x: int - left edge of the character box
    y: int - top edge of the character box
    h: int - height of the character box
    w: int - width of the character box
    offset: int - x offset at which the foreground is laid over the background
    resize: number - rescale percentage applied before cropping
    text: string - recognised character, used in the file name
    conf: number - OCR confidence, used in the file name
    output_dir: string - directory the png is written to

    returns:
    None
    '''
    # generate raw image
    background, foreground = prepare_b64_image(background, foreground)
    # set it to the high performing offset
    extracting_image = add_transparent_image(background=background,
                                             foreground=foreground,
                                             x_offset=int(offset),
                                             y_offset=0)
    # set it to the expected resize
    extracting_image_scaled = rescale_image(extracting_image, resize)
    # NOTE(review): the box was measured on the output of captcha_preprocessing,
    # which also crops rows with remove_noise before resizing; that crop is not
    # re-applied here - confirm the y coordinates still line up.
    # slicing needs plain integers
    x, y, w, h = int(x), int(y), int(w), int(h)
    # crop image
    crop = extracting_image_scaled[y:y + h, x:x + w]
    if crop.size == 0:
        # a box outside the image would make cvtColor raise; skip it instead
        print('empty crop, nothing written for', text)
        return
    # edit conf: drop the decimal point and keep at most 8 characters
    conf = str(conf).replace('.', '')[0:8]
    # gray
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    # write to disk; imwrite reports failure through its return value
    out_path = os.path.join(output_dir, f"{text}_{conf}.png")
    if not cv2.imwrite(out_path, gray):
        print('failed to write', out_path)
def parallel_search_fn(bg_set, fg_set, params, max_workers):
    '''
    Run ocr_sliding_image on every background / foreground pair in a process
    pool and hand each detection to extract_character

    params:
    bg_set: iterable - base64 background images as text
    fg_set: iterable - base64 foreground images as text, paired with bg_set by position
    params: dict - dictionary of parameterized options; 'max_offset' (default
                   60 when absent) and 'resize' are read here, the dict as a
                   whole is forwarded to ocr_sliding_image
    max_workers: int - number of worker processes

    returns:
    None
    '''
    # honour the configured sweep width; 60 was previously hard-coded
    max_offset = params.get('max_offset', 60)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # one task per image pair
        futures = [
            executor.submit(ocr_sliding_image, max_offset=max_offset, bg=bg, fg=fg, params=params)
            for bg, fg in zip(bg_set, fg_set)
        ]
        # unpack results in submission order
        print(f'Starting to unpack this many results: {len(futures)}')
        for future in futures:
            result = future.result()
            if result is None:
                continue
            # extract chars; an empty frame simply yields no rows
            for row in result.itertuples(index=False):
                extract_character(background=row.bg_name,
                                  foreground=row.fg_name,
                                  x=row.left,
                                  y=row.top,
                                  h=row.height,
                                  w=row.width,
                                  offset=row.offset,
                                  resize=params['resize'],
                                  text=row.text,
                                  conf=row.conf)
# Structuring-element shapes that can be used for params['morph_shape']:
# shape1_ = cv2.MORPH_RECT
# shape2_ = cv2.MORPH_CROSS
# shape3_ = cv2.MORPH_ELLIPSE
if __name__ == '__main__':
    # find max workers: one per CPU, capped at 61 (the limit a process pool
    # accepts on Windows). Replaces reading the private `_max_workers`
    # attribute of a throwaway executor that was never shut down.
    max_workers = min(61, os.cpu_count() or 1)

    # sample set: csv with one base64 background ('bg') and foreground ('fg') per row
    captcha_set = pd.read_csv(r"C:\Users\afogarty\Desktop\captcha\dataset\raw_base64_dataset.csv")
    bg_set = captcha_set['bg'].values
    fg_set = captcha_set['fg'].values

    # # set params
    # # finished with loss -81321596.3750 and params dict_items([('at_window', 33), ('b_l', 23), ('b_r', 5), ('c', 33), ('config', "--psm 6 --oem 1 -c tessedit_char_whitelist=' ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'"), ('contour_area', 420), ('e_l', 2), ('e_r', 4), ('morph_shape', 2), ('resize', 340), ('s_x', 3), ('s_y', 4), ('workers', 20)])
    params = {'at_window': 35,
              'b_l': 15,
              'b_r': 5,
              'c': 29,
              'blur': 5,
              'config': '--psm 6 --oem 1 -c tessedit_char_whitelist=" ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"',
              'contour_area': 450,
              'e_l': 4,
              'e_r': 4,
              'morph_shape': cv2.MORPH_CROSS,
              'resize': 170,
              's_x': 2,
              's_y': 1,
              'max_offset': 50}

    # run ops
    print(f'Starting operations using this many workers: {max_workers}')
    parallel_search_fn(bg_set, fg_set, params, max_workers)

    # report conclusion
    print('Operations finished!')