# database.py

import os
import re

from langchain.text_splitter import CharacterTextSplitter


def langchain_splitter(text, max_tokens=1000):
    """Split raw text into fixed-size chunks with LangChain's CharacterTextSplitter.

    Note: CharacterTextSplitter measures chunk_size in characters by default,
    so max_tokens is effectively a character budget, not a true token count.
    """
    text_splitter = CharacterTextSplitter(chunk_size=max_tokens, chunk_overlap=0)
    articles = {f"Chunk {i + 1}": {"text": chunk}
                for i, chunk in enumerate(text_splitter.split_text(text))}
    return articles
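
# The splitter returns a mapping shaped like this hypothetical excerpt:
#   {"Chunk 1": {"text": "..."}, "Chunk 2": {"text": "..."}, ...}
# loadOrdinance below flattens it into parallel text/metadata lists.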


def txt_to_text(file_name):
    """Read a text file and return its full contents as one string."""
    with open(file_name, 'r', encoding='utf-8') as f:
        text = f.read()
    return text


def split_law_document(file_name, use_langchain_splitter=False):
    """Parse an ordinance text file into a dict of articles keyed by heading."""
    text = txt_to_text(file_name)  # read up front; both splitting strategies need it
    if use_langchain_splitter:
        return langchain_splitter(text)

    def process_article(prev_article_start, end_position, current_part):
        text_part = text[prev_article_start:end_position]
        # Keep only the text before any part-separator rule (a run of underscores).
        text_part = text_part.split("___________")[0]
        article_dicts = []
        if len(text_part.split()) > 1000:
            # Split an over-long article in two at the newline nearest its midpoint.
            half_length = len(text_part) // 2
            split_index = text_part[:half_length].rfind("\n")
            if split_index == -1:
                split_index = half_length  # no newline in the first half; split mid-line
            text_part_1 = text_part[:split_index]
            text_part_2 = text_part[split_index:]
            article_dicts.extend([
                {"text": text_part_1, "part_number": current_part["number"], "part_title": current_part["title"]},
                {"text": text_part_2, "part_number": current_part["number"], "part_title": current_part["title"]}
            ])
        else:
            article_dicts.append({"text": text_part, "part_number": current_part["number"], "part_title": current_part["title"]})
        return article_dicts

    articles = {}
    # A part header is an 11-underscore rule followed by the part number and title lines.
    part_matches = re.finditer(r'^_{11}\n(.+?)\n(.+?)\n', text, re.MULTILINE)
    # An article heading is a number, optionally letter-suffixed (e.g. "12A."),
    # followed by a period and the heading text.
    article_matches = re.finditer(r'\n\s*((\d+[A-Z]?)[.]\s*(?:\t)?\s*.+?)\n', text)
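    # Both patterns assume a source layout like this hypothetical excerpt:
    #
    #   ___________
    #   Part II
    #   Administration
    #
    #   5. Appointment of officers
    #   The Minister may appoint ...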
current_part = {"number": "Part I", "title": "Preliminary"}
prev_article_key = prev_article_start = part_match = None
part_match = next(part_matches, None)
    for match in article_matches:
        article_key = match.group(1)
        article_start = match.start()
        if prev_article_start is not None:
            # The previous article's body runs up to the current heading.
            article_dicts = process_article(prev_article_start, article_start, current_part)
            for i, a in enumerate(article_dicts):
                # Suffix the second half of a split article so it does not overwrite the first.
                key = prev_article_key if i == 0 else f"{prev_article_key} (cont.)"
                articles[key] = a
        # Advance current_part past any part headers that precede this article.
        while part_match and part_match.start() < article_start:
            current_part["number"] = part_match.group(1)
            current_part["title"] = part_match.group(2)
            part_match = next(part_matches, None)
        prev_article_key = article_key
        prev_article_start = article_start
    if prev_article_start is not None:
        # Flush the final article; it ends at the next part header or end of file.
        end_position = part_match.start() if part_match else len(text)
        article_dicts = process_article(prev_article_start, end_position, current_part)
        for i, a in enumerate(article_dicts):
            key = prev_article_key if i == 0 else f"{prev_article_key} (cont.)"
            articles[key] = a
    return articles
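
# split_law_document returns a mapping such as (hypothetical values):
#   {"1. Short title": {"text": "...", "part_number": "Part I", "part_title": "Preliminary"},
#    "2. Interpretation": {"text": "...", "part_number": "Part I", "part_title": "Preliminary"}}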


def loadOrdinance(use_langchain_splitter=False):
    """Load every .txt file under Ordinances/ and return parallel (texts, metadata) lists."""
    ordinances_path = "Ordinances"
    processed, meta = [], []
    for file in os.listdir(ordinances_path):
        if not file.endswith(".txt"):
            continue
        file_path = os.path.join(ordinances_path, file)
        articles = split_law_document(file_path, use_langchain_splitter)
        for key, value in articles.items():
            processed.append(value["text"])
            if use_langchain_splitter:
                # The generic splitter carries no article or part structure.
                meta.append({"Ordinance": file[:-4]})
            else:
                meta.append({"Article": key, "Part": value["part_number"],
                             "Part_title": value["part_title"], "Ordinance": file[:-4]})
    return processed, meta
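

# A minimal usage sketch, assuming an "Ordinances" directory of .txt files
# (e.g. a hypothetical "Ordinances/Companies.txt") sits next to this module.
if __name__ == "__main__":
    texts, metadata = loadOrdinance(use_langchain_splitter=False)
    print(f"Loaded {len(texts)} article chunks")
    for m in metadata[:3]:
        print(m)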