-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcky.py
143 lines (123 loc) · 5.18 KB
/
cky.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from pprint import pprint
# The productions rules have to be binarized.
grammar_text = """
S -> NPZ VP
S -> NP VBZ
NP -> Det Noun
NPZ -> Det Nouns
VP -> Verb NP
VBZ -> Verbs NP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
"""
lexicon = {
'Nouns': set(['cats', 'dogs']),
'Verbs': set(['attacked','attacks']),
'Noun': set(['cat', 'dog', 'table', 'food']),
'Verb': set([ 'saw', 'loved', 'hated', 'attack']),
'Prep': set(['in', 'of', 'on', 'with']),
'Det': set(['the', 'a']),
}
# Process the grammar rules. You should not have to change this.
grammar_rules = []
for line in grammar_text.strip().split("\n"):
if not line.strip(): continue
left, right = line.split("->")
left = left.strip()
children = right.split()
rule = (left, tuple(children))
grammar_rules.append(rule)
possible_parents_for_children = {}
for parent, (leftchild, rightchild) in grammar_rules:
if (leftchild, rightchild) not in possible_parents_for_children:
possible_parents_for_children[leftchild, rightchild] = []
possible_parents_for_children[leftchild, rightchild].append(parent)
# Error checking
all_parents = set(x[0] for x in grammar_rules) | set(lexicon.keys())
for par, (leftchild, rightchild) in grammar_rules:
if leftchild not in all_parents:
assert False, "Nonterminal %s does not appear as parent of prod rule, nor in lexicon." % leftchild
if rightchild not in all_parents:
assert False, "Nonterminal %s does not appear as parent of prod rule, nor in lexicon." % rightchild
#print "Grammar rules in tuple form:"
#pprint(grammar_rules)
#print "Rule parents indexed by children:"
#pprint(possible_parents_for_children)
def cky_acceptance(sentence):
# return True or False depending whether the sentence is parseable by the grammar.
global grammar_rules, lexicon
# Set up the cells data structure.
# It is intended that the cell indexed by (i,j)
# refers to the span, in python notation, sentence[i:j],
# which is start-inclusive, end-exclusive, which means it includes tokens
# at indexes i, i+1, ... j-1.
# So sentence[3:4] is the 3rd word, and sentence[3:6] is a 3-length phrase,
# at indexes 3, 4, and 5.
# Each cell would then contain a list of possible nonterminal symbols for that span.
# If you want, feel free to use a totally different data structure.
N = len(sentence)
cells = {}
for i in range(N):
for j in range(i + 1, N + 1):
cells[(i, j)] = []
# TODO replace the below with an implementation
# First working on the Dialonal cells
for i in range(len(sentence)):
for l in lexicon:
if sentence[i] in lexicon[l]:
cells[i, i + 1].append(l)
for j in range(2, len(sentence) + 1):
for i in range(len(sentence) - j + 1):
m = i + j
while m - 1 > i:
for rule in cells[i, m - 1]:
for other_rule in cells[m - 1, i + j]:
if (rule, other_rule) in possible_parents_for_children:
new_rule = possible_parents_for_children[(rule, other_rule)]
cells[i, i + j].append(new_rule[0])
m = m - 1
pprint(cells)
return 'S' in cells[0, len(sentence)]
def cky_parse(sentence):
# Return one of the legal parses for the sentence.
# If nothing is legal, return None.
# This will be similar to cky_acceptance(), except with backpointers.
global grammar_rules, lexicon
N = len(sentence)
cells = {}
back_trace = {}
for i in range(N):
for j in range(i + 1, N + 1):
cells[(i, j)] = []
# TODO replace the below with an implementation
for i in range(len(sentence)):
for l in lexicon:
if sentence[i] in lexicon[l]:
cells[i, i + 1].append([l, sentence[i]])
for j in range(2, len(sentence) + 1):
for i in range(len(sentence) - j + 1):
m = i + j
while m - 1 > i:
for r,rule in enumerate(cells[i, m - 1]):
for o_r, other_rule in enumerate(cells[m - 1, i + j]):
#print(rule[0], other_rule[0])
if (rule[0], other_rule[0]) in possible_parents_for_children:
new_rule = possible_parents_for_children[rule[0], other_rule[0]]
cells[i, i + j].append([new_rule[0], [rule, other_rule]])
m = m - 1
#pprint(cells)
for rule in cells[0, len(sentence)]:
if 'S' == rule[0]:
return rule
return "Rejected"
## some examples of calling these things...
## you probably want to call only one sentence at a time to help debug more easily.
# print cky_acceptance(['the','cat','attacked','the','food'])
# pprint( cky_parse(['the','cat','attacked','the','food']))
# pprint( cky_acceptance(['the','the']))
# pprint( cky_parse(['the','the']))
# print cky_acceptance(['the','cat','attacked','the','food','with','a','dog'])
# pprint( cky_parse(['the','cat','attacked','the','food','with','a','dog']) )
# pprint( cky_parse(['the','cat','with','a','table','attacked','the','food']) )
#