forked from sharanya02/Text-file-to-handwritten-pdf-file
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScannedImages-to-PDF.py
149 lines (127 loc) · 4.15 KB
/
ScannedImages-to-PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Usage
# python ScannedImages-to-PDF.py -i image_path
from PIL import Image
import pytesseract
import argparse
import cv2
import os
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to input image to be OCR'd")
ap.add_argument(
"-p",
"--preprocess",
type=str,
default="thresh",
help="type of preprocessing to be done",
)
args = vars(ap.parse_args())
# load the example image and convert it to grayscale
image = cv2.imread(args["image"])
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# cv2.imshow("Image", gray)
# check to see if we should apply thresholding to preprocess the
# image
if args["preprocess"] == "thresh":
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be done to remove
# noise
elif args["preprocess"] == "blur":
gray = cv2.medianBlur(gray, 3)
# write the grayscale image to disk as a temporary file so we can
# apply OCR to it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
# load the image as a PIL/Pillow image, apply OCR, and then delete
# the temporary file
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
text = pytesseract.image_to_string(Image.open(filename))
os.remove(filename)
print(text.encode("utf-8"))
file = open("testfile.txt", "w+")
file.write(text)
file.close()
# show the output images
# cv2.imshow("Image", image)
# cv2.imshow("Output", gray)
cv2.waitKey(0)
print("------------------- processed OCR -------------------")
BG = Image.open("myfont\\bg.png")
sizeOfSheet = BG.width
gap, _ = 0, 0
allowedChars = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM,.-?!() 1234567890"
def writee(char):
global gap, _
if char == "\n":
pass
else:
char.lower()
cases = Image.open("myfont/%s.png" % char)
BG.paste(cases, (gap, _))
size = cases.width
gap += size
del cases
def letterwrite(word):
global gap, _
if gap > sizeOfSheet - 95 * (len(word)):
gap = 0
_ += 200
for letter in word:
if letter in allowedChars:
if letter.islower():
pass
elif letter.isupper():
letter = letter.lower()
letter += "upper"
elif letter == ".":
letter = "fullstop"
elif letter == "!":
letter = "exclamation"
elif letter == "?":
letter = "question"
elif letter == ",":
letter = "comma"
elif letter == "(":
letter = "braketop"
elif letter == ")":
letter = "braketclose"
elif letter == "-":
letter = "hiphen"
writee(letter)
def worddd(Input):
wordlist = Input.split(" ")
for i in wordlist:
letterwrite(i)
writee("space")
if __name__ == "__main__":
try:
with open("testfile.txt", "r") as file:
data = file.read().replace("\n", "")
l = len(data)
nn = len(data) // 600
chunks, chunk_size = len(data), len(data) // (nn + 1)
p = [data[i : i + chunk_size] for i in range(0, chunks, chunk_size)]
for i in range(0, len(p)):
worddd(p[i])
writee("\n")
BG.save("myfont/%doutt.png" % i)
BG1 = Image.open("myfont/bg.png")
BG = BG1
gap = 0
_ = 0
except ValueError as E:
print("{}\nTry again".format(E))
# for conversion of img to pdf
from fpdf import FPDF
from PIL import Image
print("------------------- Converting to PDF -------------------")
imagelist = []
for i in range(0, len(p)):
imagelist.append("myfont/%doutt.png" % i)
cover = Image.open(imagelist[0])
width, height = cover.size
pdf = FPDF(unit="pt", format=[width, height])
for i in range(0, len(imagelist)):
pdf.add_page()
pdf.image(imagelist[i], 0, 0)
pdf.output("output.pdf", "F")