-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfile_utils.py
85 lines (66 loc) · 2.45 KB
/
file_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import magic
import os
import yaml
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from typing import List # Add this import
def get_config(config_path="config.template.yaml", secrets_path="secrets.yaml"):
    """Load the YAML configuration and overlay optional secrets on top of it.

    Args:
        config_path: Path to the main YAML configuration file. It must exist.
        secrets_path: Path to an optional YAML file whose top-level keys
            override those of the main configuration. It is skipped when the
            file does not exist.

    Returns:
        The parsed configuration (expected to be a mapping), with the secrets
        merged in when the secrets file is present. An empty configuration
        file yields an empty dict.

    Raises:
        OSError: If ``config_path`` cannot be opened.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        # safe_load returns None for an empty document; fall back to an empty
        # mapping so the merge below (and callers) always get a dict.
        config = yaml.safe_load(config_file) or {}

    if os.path.exists(secrets_path):
        with open(secrets_path, "r", encoding="utf-8") as secrets_file:
            # Same fallback: an empty secrets file must not break the merge.
            secrets = yaml.safe_load(secrets_file) or {}
        config.update(secrets)

    return config
def read_questions(file_path):
    """Read questions from a text file, one question per line.

    Leading and trailing whitespace is stripped from every line, and lines
    that are empty after stripping are skipped.

    Args:
        file_path: Path to the text file; it is decoded as UTF-8.

    Returns:
        A list with the non-blank lines, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        # The generator is lazy, so it has to be consumed while the file is
        # still open.
        return [line for line in stripped_lines if line]
def read_text(path):
    """Return the text content of the file at ``path``.

    The path is cleaned up first (trailing whitespace, embedded ' \\n'
    sequences and URL-encoded newlines '%0A' are removed) and made absolute.
    The MIME type reported by ``magic.from_file`` then decides how the file
    is read:

    * ``text/plain``: the bytes are decoded as UTF-8.
    * ``text/html``: the markup is parsed with BeautifulSoup and only its
      text is kept.
    * ``application/pdf``: a notice is printed; no text is extracted.

    Any other type produces an empty string.

    Side effect: when the absolute path contains 'content/' and the file
    exists, the file is deleted after it has been processed.
    """
    cleaned_path = path.rstrip().replace(' \n', '').replace('%0A', '')
    filename = os.path.abspath(cleaned_path)
    mime_type = magic.from_file(filename, mime=True)

    text = ""
    if mime_type == 'application/pdf':
        print('PDF not supported yet')
    elif mime_type == 'text/plain':
        with open(filename, 'rb') as handle:
            raw_bytes = handle.read()
        text = raw_bytes.decode('utf-8')
    elif mime_type == 'text/html':
        with open(filename, 'rb') as handle:
            text = BeautifulSoup(handle, 'html.parser').get_text()

    # NOTE(review): 'content/' presumably marks a scratch/download directory
    # whose files are disposable once read -- confirm against callers.
    if os.path.exists(filename) and 'content/' in filename:
        os.remove(filename)

    return text
def chunk_text_by_sentences(source_text: str, sentences_per_chunk: int, overlap: int, language="english") -> List[str]:
    """Split text into chunks of consecutive sentences, with optional overlap.

    The text is split into sentences with ``sent_tokenize`` and grouped into
    consecutive runs of ``sentences_per_chunk`` sentences (the last run may be
    shorter). When ``overlap`` is positive, every chunk after the first is
    prefixed with the ``overlap`` sentences that precede it, so those chunks
    hold up to ``sentences_per_chunk + overlap`` sentences.

    Args:
        source_text: The text to split.
        sentences_per_chunk: Number of new sentences per chunk; at least 2.
        overlap: Number of preceding sentences repeated at the start of each
            chunk after the first; ``0 <= overlap < sentences_per_chunk``.
        language: Language name passed to ``sent_tokenize``.

    Returns:
        The chunks, each one its sentences joined by single spaces. An empty
        list is returned (and a notice printed) when no sentence is found.

    Raises:
        ValueError: If ``sentences_per_chunk`` or ``overlap`` is out of range.
    """
    if sentences_per_chunk < 2:
        raise ValueError(
            "The number of sentences per chunk must be 2 or more.")
    # The bound matches the error message: any overlap strictly smaller than
    # the chunk size is valid, including sentences_per_chunk - 1.
    if overlap < 0 or overlap >= sentences_per_chunk:
        raise ValueError(
            "Overlap must be 0 or more and less than the number of sentences per chunk.")

    sentences = sent_tokenize(source_text, language=language)
    if not sentences:
        print("Nothing to chunk")
        return []

    chunks = []
    for start in range(0, len(sentences), sentences_per_chunk):
        # The first chunk has nothing before it to repeat. For later chunks
        # start >= sentences_per_chunk > overlap, so start - overlap is > 0.
        window_start = start - overlap if start > 0 else 0
        window_end = start + sentences_per_chunk  # slicing clamps at the end
        chunk = ' '.join(sentences[window_start:window_end])
        chunks.append(chunk.strip())
    return chunks