# untranslated_templates.py
from re import compile, IGNORECASE, VERBOSE
from utils import pagescraper_queue, time_and_date, plural, whatlinkshere
from wikitools import wiki

verbose = False

LANGS = ['ar', 'cs', 'da', 'de', 'es', 'fi', 'fr', 'hu', 'it', 'ja', 'ko', 'nl', 'no', 'pl', 'pt', 'pt-br', 'ro', 'ru', 'sv', 'tr', 'zh-hans', 'zh-hant']

LANG_TEMPLATE_START = compile(r"""
  [^{]{{ # The start of a template '{{' which is not the start of a parameter '{{{'
  \s*    # Any amount of whitespace is allowed before the template name
  lang   # Language template. Note that this does not allow invocations of {{lang incomplete}}
  \s*    # Any amount of whitespace (but critically, no more ascii characters)
  \|     # Start of parameter list
""", IGNORECASE | VERBOSE)

LANG_TEMPLATE_ARGS = compile(r"""
  \|        # Start of a parameter
  (
    [^|=]*? # Parameter
  )
  =         # Start of a value
  (
    [^|]*   # Value
  )
""", VERBOSE)

def parse_lang_templates(page):
  page_text = page.get_wiki_text()

  # First, find the matching pairs
  def get_indices(char, string):
    index = -1
    indices = []
    while 1:
      try:
        index = string.index(char, index+1)
      except ValueError:
        break
      indices.append(index)
    return indices
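  # For example, get_indices('{', 'a{b}{c') returns [1, 4], i.e. every index at
  # which the given character occurs in the string.
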
  locations = [[len(page_text), -1]]
  for open in get_indices('{', page_text):
    locations.append([open, 1])
  for open in get_indices('[', page_text):
    locations.append([open, 1])
  for close in get_indices('}', page_text):
    locations.append([close, -1])
  for close in get_indices(']', page_text):
    locations.append([close, -1])
  locations.sort()

  # Next, divide the text up based on those pairs. Embedded text is separated out, e.g. {a{b}c} will become "ac" and "b".
  stack = [0]
  buffer = {0: ''}
  lastIndex = 0
  for index, value in locations:
    try:
      buffer[stack[-1]] += page_text[lastIndex:index]
    except KeyError:
      buffer[stack[-1]] = page_text[lastIndex:index]
    except IndexError:
      buffer[0] += page_text[lastIndex:index] # Add text to default layer
      stack.append(None) # So there's something to .pop()
      if verbose:
        print(page.title, 'Found a closing brace without a matched opening brace')
    if value == 1:
      stack.append(index)
    elif value == -1:
      stack.pop()
    lastIndex = index + 1
  if verbose:
    print(page.title, 'contains', len(buffer), 'pairs of braces')
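
  # To illustrate (example text, not from a real page): for "x{{lang|en=Hi}}y"
  # the loop above produces buffer == {0: 'xy', 1: '', 2: 'lang|en=Hi'}, keyed by
  # the index of the brace that opened each layer (0 is the top-level text).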

  # Finally, search through for lang templates using regex
  lang_templates = []
  for match in LANG_TEMPLATE_START.finditer(page_text):
    lang_template = []
    for match2 in LANG_TEMPLATE_ARGS.finditer(buffer[match.start() + 2]): # Skip the opening {{
      language = match2.group(1).strip().lower()
      text = match2.group(2).strip()
      lang_template.append((language, text))
    # Add an identifier to the start of the data (line number + first language string)
    location = "''Line %d'': <nowiki>%s</nowiki>" % (
      page_text[:match.start()].count('\n') + 1,
      lang_template[0][1].split('\n', 1)[0].strip() if len(lang_template) > 0 else '',
    )
    lang_template.insert(0, location)
    lang_templates.append(lang_template)
  return lang_templates
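
# For the "x{{lang|en=Hi}}y" example above, parse_lang_templates would return
# roughly [["''Line 1'': <nowiki>Hi</nowiki>", ('en', 'Hi')]]: one list per
# {{lang}} invocation, a location identifier followed by (language, text) pairs.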

def pagescraper(page, translations, usage_counts):
  lang_templates = parse_lang_templates(page)
  missing_translations = {lang: [] for lang in LANGS}
  for lang_template in lang_templates:
    location = lang_template.pop(0)
    missing_languages = set(LANGS)
    for lang, _ in lang_template:
      missing_languages.discard(lang)
    for language in missing_languages:
      missing_translations[language].append(location)

  usage_count = page.get_transclusion_count()
  if usage_count == 0:
    return # Who cares, if it's not being used.
  usage_counts[page.title] = usage_count

  for lang, lang_missing_translations in missing_translations.items():
    if len(lang_missing_translations) > 0:
      translations[lang].append((page, lang_missing_translations))
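
# After pagescraper has run over every template, the shared dicts look roughly
# like this (illustrative values only):
#   usage_counts == {'Template:Example': 12, ...}
#   translations == {'ru': [(<Template:Example page>, ["''Line 4'': <nowiki>...</nowiki>"]), ...], ...}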

def main(w):
  translations = {lang: [] for lang in LANGS}
  usage_counts = {}
  with pagescraper_queue(pagescraper, translations, usage_counts) as pages:
    for page in w.get_all_templates():
      if '/' in page.title:
        continue # Don't include subpage templates like Template:Dictionary and Template:PatchDiff
      if page.title[:13] == 'Template:User':
        continue # Don't include userboxes
      if page.title == 'Template:Lang':
        continue # Special exclusion
      pages.put(page)

  outputs = []
  for language in LANGS:
    output = """\
{{{{DISPLAYTITLE: {count} templates missing {{{{lang name|name|{lang}}}}} translation}}}}
Pages missing in {{{{lang info|{lang}}}}}: '''<onlyinclude>{count}</onlyinclude>''' in total. Data as of {date}.
; See also
* [[TFW:Reports/All articles/{lang}|All articles in {{{{lang name|name|{lang}}}}}]]
* [[TFW:Reports/Missing translations/{lang}|Missing article translations in {{{{lang name|name|{lang}}}}}]]
* [[Special:RecentChangesLinked/Project:Reports/All articles/{lang}|Recent changes to articles in {{{{lang name|name|{lang}}}}}]]
== List ==""".format(
      lang=language,
      count=len(translations[language]),
      date=time_and_date())

    for template, missing in sorted(translations[language], key=lambda elem: (-usage_counts[elem[0].title], elem[0].title)):
      count = usage_counts[template.title]
      output += f'\n# [{template.get_edit_url()} {template.title}] has [{whatlinkshere(template.title, count)} {plural.uses(count)}] and is missing {plural.translations(len(missing))}'
      for location in missing:
        output += f'\n#:{location}'
    outputs.append([language, output])
  return outputs
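
# main returns [language, report wikitext] pairs, roughly (illustrative)
# [['ru', "{{DISPLAYTITLE: ...}} ..."], ...], which the block below writes out
# as one section per language.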

if __name__ == '__main__':
  verbose = True
  w = wiki.Wiki('https://wiki.teamfortress.com/w/api.php')
  with open('wiki_untranslated_templates.txt', 'w') as f:
    for lang, output in main(w):
      f.write('\n===== %s =====\n' % lang)
      f.write(output)
  print(f'Article written to {f.name}')