-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenCount.py
executable file
·54 lines (47 loc) · 1.44 KB
/
tokenCount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python3 -W all
"""
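tokenCount.py: report, for each label, the average number of sentences,
tokens and characters per input line (averages are truncated to integers)
input: one text per line; the first whitespace-separated token is the label
usage: tokenCount.py < file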
20180516 erikt(at)xs4all.nl
"""
import re
import sys
def getLabel(line):
    """Split a line into its leading label and the remaining text.

    The line is split on whitespace. The first field is the label and the
    other fields are re-joined with single spaces, so runs of whitespace
    (and the trailing newline) are normalised away.

    Returns a (label, text) tuple. Raises IndexError when the line holds
    no fields at all (empty or whitespace-only).
    """
    fields = line.split()
    label = fields[0]
    remainder = " ".join(fields[1:])
    return label, remainder
def countChars(line):
    """Return the number of characters in line, whitespace included."""
    nbrOfChars = len(line)
    return nbrOfChars
def countTokens(line):
    """Return the number of whitespace-separated tokens in line."""
    return len(line.split())
def countSents(line):
    """Return the number of sentences found in a whitespace-tokenised line.

    A sentence is counted for every token that consists solely of '.',
    '!' or '?' characters, and for the final token of the line whatever
    it contains (a final punctuation token is counted only once).
    An empty line yields 0.
    """
    words = line.split()
    lastPos = len(words) - 1
    return sum(
        1
        for pos, word in enumerate(words)
        if pos == lastPos or re.search(r"^[.!?]+$", word)
    )
def main(argv):
    """Print per-label average sentence, token and character counts.

    Reads standard input line by line. The first whitespace-separated
    token of each line is taken as its label (via getLabel) and the rest
    as the text to measure. For every label, one output line is printed
    in first-seen order:

        label avgSentences avgTokens avgChars

    Each average is the label's total divided by the number of lines
    carrying that label, rounded down to an integer.

    Blank or whitespace-only input lines are skipped: they carry no label
    and would otherwise make getLabel raise IndexError and abort the run.

    argv is accepted for the conventional main(sys.argv) call but is not
    used. Nothing is returned.
    """
    # label -> [sentence total, token total, char total, line count]
    totals = {}
    for rawLine in sys.stdin:
        if not rawLine.strip():
            # No label to extract; ignore the line instead of crashing.
            continue
        label, text = getLabel(rawLine)
        record = totals.setdefault(label, [0, 0, 0, 0])
        record[0] += countSents(text)
        record[1] += countTokens(text)
        record[2] += countChars(text)
        record[3] += 1
    for label, (sents, tokens, chars, lines) in totals.items():
        # All totals are non-negative ints, so floor division gives the
        # same truncated average as int(a / b) without float rounding.
        print(label, sents // lines, tokens // lines, chars // lines)
# Run only when executed as a script; main's return value becomes the
# process exit status.
if __name__ == "__main__":
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)