-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtreebank.py
69 lines (56 loc) · 1.67 KB
/
treebank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from nltk.corpus.reader import find_corpus_fileids
from nltk.data import FileSystemPathPointer
import glob
from nltk.tag import simplify_wsj_tag
from nltk.tree import Tree
# Node labels that nopunct() filters out of a tree (punctuation, quote,
# bracket, symbol and "-NONE-" labels).
badtokens = [
    ".",
    ",",
    "``",
    "''",
    ":",
    "$",
    "-NONE-",
    "-RRB-",
    "-LRB-",
    "#",
]
def isterminal(tree):
    """Check for a terminal node.

    Returns True when ``tree`` is not a ``Tree`` instance (i.e. it is a
    leaf value), False otherwise.
    """
    return not isinstance(tree, Tree)
def ispreterminal(tree):
    """Check whether a node is the parent of a terminal node.

    Returns True when at least one child of ``tree`` is not a ``Tree``
    instance. A node without any children yields False.
    """
    return any(not isinstance(child, Tree) for child in tree)
def nopunct(sent):
    """Remove bad tokens from the given sentence's gold tree.

    A node for which ispreterminal() is true is returned unchanged (the
    same object, not a copy). Otherwise a new ``Tree`` with the same node
    label is built from the children whose label is not listed in
    ``badtokens``, each of them filtered recursively.

    Dropping children may leave a node with no children at all; this
    function does not remove such empty nodes.
    """
    if ispreterminal(sent):
        return sent
    # Every child is a Tree here, so reading ``child.node`` is safe.
    kept = [nopunct(child) for child in sent if child.node not in badtokens]
    return Tree(sent.node, kept)
def noemptysubtree(sent):
    """Return ``sent`` with every childless subtree removed.

    A node for which ispreterminal() is true is returned unchanged (the
    same object). Otherwise a new ``Tree`` with the same node label is
    built from the recursively cleaned children, skipping those that end
    up with no children.

    The root itself is never dropped: if everything below it is removed,
    a ``Tree`` with an empty child list is returned.
    """
    if ispreterminal(sent):
        return sent
    # Clean the children first and test for emptiness afterwards, so that a
    # child which only becomes empty through pruning is dropped as well.
    cleaned = [noemptysubtree(child) for child in sent]
    return Tree(sent.node, [child for child in cleaned if len(child) > 0])
class TreeUtil:
    """Helpers for filtering parse trees and extracting their bracketings."""

    @classmethod
    def filteredcopy(cls, t):
        """Return a filtered deep copy of the tree ``t``.

        The copy is passed through nopunct() and then noemptysubtree();
        ``t`` itself is not modified by this method.
        """
        return noemptysubtree(nopunct(t.copy(deep=True)))

    @classmethod
    def bracketing(cls, tree, leaves=True, root=True, unary=True):
        """Return the set of unlabeled spans of ``tree``.

        Each span is a tuple ``(start, end)`` of leaf offsets, where
        ``start`` is the number of leaves preceding the node and ``end``
        is the offset just past its last leaf.

        tree   -- object providing ``treepositions()``, indexing by
                  position tuple and ``leaves()``. The traversal relies
                  on ``treepositions()`` listing positions in preorder
                  (assumed to be the default for NLTK trees).
        leaves -- if true, include the ``(i, i + 1)`` span of every leaf.
        root   -- if false, remove the span ``(0, number of leaves)``.
        unary  -- if false, drop every span covering exactly one leaf.
        """
        positions = tree.treepositions()
        total = len(positions)
        # Index of the next unvisited position; an index cursor avoids the
        # O(n) cost of popping from the front of the list on every step.
        nxt = 1
        # Stack of (position, leaf offset at which that node starts).
        stack = [(positions[0], 0)]
        j = 0  # number of leaves consumed so far
        result = set()
        while stack:
            p, i = stack[-1]
            if nxt == total or positions[nxt][:-1] != p:
                # The next position is not a child of p: p is complete.
                if isinstance(tree[p], Tree):
                    result.add((i, j))
                else:
                    if leaves:
                        result.add((i, i + 1))
                    # A leaf always advances the offset, recorded or not.
                    j = i + 1
                stack.pop()
            else:
                # Descend into the next child of p; it starts at offset j.
                stack.append((positions[nxt], j))
                nxt += 1
        if not root:
            result.remove((0, len(tree.leaves())))
        if not unary:
            # Generator expression instead of a tuple-parameter lambda,
            # which is a syntax error on Python 3.
            result = set((x, y) for (x, y) in result if x != y - 1)
        return result