-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfix_special_characters.py
56 lines (43 loc) · 1.31 KB
/
fix_special_characters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Replace smart quotes, long dashes, and other special characters that we don't want in the source.
"""
# TODO (low priority): run this as a pre-commit hook on the solution files (before generating instructions).
import re
import argparse
import sys
# Matches any single character outside the 7-bit ASCII range.
special_re = re.compile(r"([^\x00-\x7f])")

# ASCII substitutes for the non-ASCII characters this tool knows how to fix.
REPLACEMENTS = {
    "—": "-",
    "’": "'",
    "“": '"',
    "”": '"',
    "…": "...",
}


def fix(text: str) -> tuple[str, set[str]]:
    """Replace known special characters in *text* with ASCII equivalents.

    Every non-ASCII character is looked up in ``REPLACEMENTS``. Characters
    with an entry are substituted; characters without one are left in place
    and collected so the caller can report them.

    A summary of unrecognized characters and of the number of fixes made is
    printed to stdout when either is non-empty.

    Args:
        text: The text to clean up.

    Returns:
        A ``(fixed_text, unrecognized)`` tuple, where ``unrecognized`` is the
        set of non-ASCII characters that had no replacement.
    """
    unrecognized: set[str] = set()
    n_fixed = 0

    def replace_fn(m: re.Match) -> str:
        # The counter belongs to the enclosing fix() call, so it must be
        # rebound with ``nonlocal``; ``global`` would look for a module-level
        # name that does not exist and raise NameError on the first fix.
        nonlocal n_fixed
        c = m.group(0)
        fixed = REPLACEMENTS.get(c)
        if fixed is None:
            # Unknown character: keep it unchanged and remember it.
            unrecognized.add(c)
            return c
        n_fixed += 1
        return fixed

    fixed_text = special_re.sub(replace_fn, text)
    if unrecognized:
        print("Failed to fix: ", unrecognized)
    if n_fixed > 0:
        print(f"{n_fixed} fixes made.")
    return fixed_text, unrecognized
if __name__ == "__main__":
    # Command-line entry point: fix the named file in place.
    parser = argparse.ArgumentParser()
    parser.add_argument("input_filename")
    args = parser.parse_args()

    # newline="" turns off newline translation so the file's existing line
    # endings (e.g. CRLF) survive the read/write round trip unchanged.
    with open(args.input_filename, "r", encoding="utf-8", newline="") as f:
        text = f.read()

    fixed_text, unrecognized = fix(text)

    # Leave the file untouched when there is nothing to change.
    if fixed_text != text:
        with open(args.input_filename, "w", encoding="utf-8", newline="") as f:
            f.write(fixed_text)

    # Exit status 1 signals that characters without a known replacement remain.
    sys.exit(1 if unrecognized else 0)