-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRadixTree.py
205 lines (171 loc) · 8.2 KB
/
RadixTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from typing import Any
Char = str
class RadixTree:
TERMINAL_CHAR = chr(ord("a") - 1)
@staticmethod
def cut_strings_at_diverging_index(string1: str, string2: str) -> tuple[str, str, str]:
# return tuple[common_prefix, string1_suffix, string2_suffix]
index = 0
index_max = min(len(string1), len(string2)) - 1
while index <= index_max and string1[index] == string2[index]:
index += 1
return string1[0: index], string1[index:], string2[index:]
def __init__(self, label: str = ""):
# 我们要求 root 节点的 label 永远为空串, 所以当树中只有一个word的时候, 树有两个 node
self.label = label # path leading to this node
self.full_word: str | None = None # 仅当是叶子节点时有意义
self.full_word_count: int = 0 # 仅当是叶子节点时有意义
self.full_word_attachment: Any = None # 仅当是叶子节点时有意义
self.children: dict[Char, RadixTree] = {} # maps characters to nodes
def is_terminal(self) -> int:
# if self is terminal, assert len(self.children) == 0 and self is not the root
return len(self.label) > 0 and self.label[-1] == self.TERMINAL_CHAR
def insert(self, word: str, word_attachment: Any = None) -> None:
word += self.TERMINAL_CHAR
word_suffix = word
node = self
while True:
child: RadixTree | None = node.children.get(word_suffix[0], None)
if child is None:
new_leaf: RadixTree = RadixTree(label=word_suffix)
new_leaf.full_word = word[:-1]
new_leaf.full_word_count = 1
new_leaf.full_word_attachment = word_attachment
node.children[word_suffix[0]] = new_leaf
return
common_prefix, string1_suffix, string2_suffix = \
self.cut_strings_at_diverging_index(child.label, word_suffix)
assert len(common_prefix) > 0
if string1_suffix == string2_suffix == "":
assert common_prefix[-1] == self.TERMINAL_CHAR
child.full_word_count += 1
child.full_word_attachment = word_attachment
return
elif string1_suffix == "" and string2_suffix != "":
assert common_prefix[-1] != self.TERMINAL_CHAR
word_suffix = string2_suffix
node = child
elif string1_suffix != "" and string2_suffix == "": # 这种情况不存在
pass
else: # string1_suffix != "" and string2_suffix != ""
new_internal_node: RadixTree = RadixTree(label=common_prefix)
child.label = string1_suffix
new_leaf: RadixTree = RadixTree(label=string2_suffix)
new_leaf.full_word = word[:-1]
new_leaf.full_word_count = 1
new_leaf.full_word_attachment = word_attachment
new_internal_node.children[string1_suffix[0]] = child
new_internal_node.children[string2_suffix[0]] = new_leaf
node.children[common_prefix[0]] = new_internal_node
return
def delete(self, word: str) -> None:
word += self.TERMINAL_CHAR
word_suffix = word
node = self
parent_nodes_to_update: list[RadixTree] = [] # root 在这个栈里, 但是 root node 永远不会被 delete
paths_leading_to_child_node: list[str] = []
while True:
child: RadixTree | None = node.children.get(word_suffix[0], None)
if child is None:
raise "word not exists"
parent_nodes_to_update.append(node)
paths_leading_to_child_node.append(child.label)
common_prefix, string1_suffix, string2_suffix = \
self.cut_strings_at_diverging_index(child.label, word_suffix)
assert len(common_prefix) > 0
if string1_suffix == string2_suffix == "":
assert common_prefix[-1] == self.TERMINAL_CHAR
if child.full_word_count == 1:
leaf_is_kept = False
else:
leaf_is_kept = True
child.full_word_count -= 1
break
elif string1_suffix == "" and string2_suffix != "":
assert common_prefix[-1] != self.TERMINAL_CHAR
word_suffix = string2_suffix
node = child
elif string1_suffix != "" and string2_suffix == "": # 这种情况不存在
pass
else: # string1_suffix != "" and string2_suffix != ""
raise "word not exists"
if not leaf_is_kept:
node: RadixTree = parent_nodes_to_update.pop()
path_leading_to_child: str = paths_leading_to_child_node.pop()
# print("deleting path", path_leading_to_child)
del node.children[path_leading_to_child[0]] # 一个path只指向一个node
if len(node.children) == 1 and len(parent_nodes_to_update) > 0:
# 压缩
# leaf node len(root.children) == 0
# internal node 中只有 root node len(root.children) 可以等于[0, 26+1]
# 其他 internal node len(root.children) 等于 [2, 26+1]
(char, grand_child) = node.children.popitem()
parent: RadixTree = parent_nodes_to_update.pop()
grand_child.label = node.label + grand_child.label
parent.children[node.label[0]] = grand_child
def search(self, word: str) -> tuple[bool, Any]:
# return tuple[found_or_note, word_attachment]
word += self.TERMINAL_CHAR
word_suffix = word
node = self
while True:
child: RadixTree | None = node.children.get(word_suffix[0], None)
if child is None:
return False, None
common_prefix, string1_suffix, string2_suffix = \
self.cut_strings_at_diverging_index(child.label, word_suffix)
assert len(common_prefix) > 0
if string1_suffix == string2_suffix == "":
assert common_prefix[-1] == self.TERMINAL_CHAR
return True, child.full_word_attachment
elif string1_suffix == "" and string2_suffix != "":
assert common_prefix[-1] != self.TERMINAL_CHAR
word_suffix = string2_suffix
node = child
elif string1_suffix != "" and string2_suffix == "": # 这种情况不存在
pass
else: # string1_suffix != "" and string2_suffix != ""
return False, None
def contains_prefix(self, prefix: str) -> bool:
word_suffix = prefix
node = self
if word_suffix == "":
return len(node.children) > 0 # node is the root
while True:
child: RadixTree | None = node.children.get(word_suffix[0], None)
if child is None:
return False
common_prefix, string1_suffix, string2_suffix = \
self.cut_strings_at_diverging_index(child.label, word_suffix)
assert len(common_prefix) > 0
if string1_suffix == string2_suffix == "":
# assert child is an internal node, child can not be the root
return True
elif string1_suffix == "" and string2_suffix != "":
word_suffix = string2_suffix
node = child
elif string1_suffix != "" and string2_suffix == "":
return True
else: # string1_suffix != "" and string2_suffix != ""
return False
# https://www.cs.jhu.edu/~langmea/resources/lecture_notes/suffix_trees.pdf
# https://nbviewer.org/gist/BenLangmead/6665861
if __name__ == "__main__":
t = RadixTree()
print("======")
t.insert("hello")
print(t.search(""))
print(t.contains_prefix(""))
print("==============")
t.insert("")
print(t.search(""))
print(t.contains_prefix(""))
print("============================")
t.delete("")
print(t.search(""))
print(t.contains_prefix(""))
print("======")
t.delete("hello")
print(t.search(""))
print(t.contains_prefix(""))
print("================")