Skip to content

Commit

Permalink
Truncate repetitions, dump out results without ASCII escaping (ensure_ascii=False)
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 15, 2024
1 parent 0b1e179 commit f730955
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 2 deletions.
2 changes: 1 addition & 1 deletion detect_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def main():
predictions_by_page[name].append(pred)

with open(os.path.join(result_path, "results.json"), "w+") as f:
json.dump(predictions_by_page, f)
json.dump(predictions_by_page, f, ensure_ascii=False)

print(f"Wrote results to {result_path}")

Expand Down
2 changes: 1 addition & 1 deletion ocr_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def main():
page_image.save(os.path.join(result_path, f"{name}_{idx}_text.png"))

with open(os.path.join(result_path, "results.json"), "w+") as f:
json.dump(predictions_by_image, f)
json.dump(predictions_by_image, f, ensure_ascii=False)

print(f"Wrote results to {result_path}")

Expand Down
4 changes: 4 additions & 0 deletions surya/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from surya.detection import batch_detection
from surya.input.processing import slice_polys_from_image, slice_bboxes_from_image
from surya.postprocessing.text import truncate_repetitions
from surya.recognition import batch_recognition


Expand Down Expand Up @@ -73,6 +74,9 @@ def run_ocr(images: List[Image.Image], langs: List[List[str]], det_model, det_pr
slice_start = slice_end

assert len(image_lines) == len(det_pred["polygons"]) == len(det_pred["bboxes"])

# Remove repeated characters
image_lines = [truncate_repetitions(l) for l in image_lines]
predictions_by_image.append({
"text_lines": image_lines,
"polys": det_pred["polygons"],
Expand Down
31 changes: 31 additions & 0 deletions surya/postprocessing/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,37 @@
from surya.settings import settings


def truncate_repetitions(text: str, min_len=15) -> str:
    """Strip a repeating tail from ``text``.

    Adapted from nougat, with some cleanup. The function looks for the
    longest unit of at least ``min_len`` characters that occurs twice
    back-to-back at the very end of ``text`` and then removes every
    consecutive trailing copy of that unit.

    Note that no copy of the repeated unit is retained: for
    ``prefix + unit * k`` (``k >= 2``) the result is ``prefix``.

    Args:
        text: The string to clean up.
        min_len: Minimum length of a repeated unit to look for. Values
            below 1 are treated as 1.

    Returns:
        ``text`` unchanged when it is shorter than ``2 * min_len`` or has
        no repeating tail; otherwise ``text`` with the trailing copies of
        the repeated unit removed.
    """
    # A zero-length (or negative) unit would "repeat" vacuously and cause
    # the whole text to be discarded, so never search below length 1.
    min_len = max(min_len, 1)

    text_len = len(text)
    if text_len < 2 * min_len:
        return text

    # Try every candidate unit length, keeping the longest that repeats.
    # The upper bound is inclusive of text_len // 2 so that a text made of
    # exactly two copies of a unit is detected as well.
    max_rep_len = None
    for rep_len in range(min_len, text_len // 2 + 1):
        tail = text[text_len - rep_len:]
        before_tail = text[text_len - 2 * rep_len:text_len - rep_len]
        if before_tail == tail:
            max_rep_len = rep_len

    if max_rep_len is None:
        return text

    repeated_unit = text[-max_rep_len:]

    # Peel trailing copies of the unit off the end until none remain.
    truncated = text
    while truncated.endswith(repeated_unit):
        truncated = truncated[:-max_rep_len]

    return truncated


def get_text_size(text, font):
im = Image.new(mode="P", size=(0, 0))
draw = ImageDraw.Draw(im)
Expand Down

0 comments on commit f730955

Please sign in to comment.